In [1]:
# load pandas, sklearn, and pyspark types and functions
import pandas as pd
from sklearn.linear_model import LogisticRegression
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *

In [2]:
# load the CSV as a Spark data frame
pandas_df = pd.read_csv("https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv")
spark_df = spark.createDataFrame(pandas_df)

# assign a user ID and a partition ID using Spark SQL
spark_df.createOrReplaceTempView("spark_df")
spark_df = spark.sql("""
select *, user_id%10 as partition_id 
from (
  select *, row_number() over (order by rand()) as user_id
  from spark_df
) 
""")

# preview the results
display(spark_df)

G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,label,user_id,partition_id
0,0,0,0,0,0,0,0,0,1,0,1,1
0,0,0,0,0,0,0,1,0,0,0,2,2
0,0,0,0,0,0,1,1,0,0,0,3,3
0,0,0,0,1,1,0,0,1,1,1,4,4
1,0,0,0,0,1,0,0,0,0,0,5,5
0,0,0,0,1,1,1,0,0,0,0,6,6
0,0,0,0,1,0,1,1,0,0,0,7,7
1,0,0,0,0,0,1,1,0,0,0,8,8
1,0,1,0,0,1,0,0,0,0,0,9,9
0,0,1,0,0,0,1,1,0,1,1,10,0


In [3]:
# train a model, but first, pull everything to the driver node
df = spark_df.toPandas().drop(['user_id', 'partition_id'], axis = 1)

y_train = df['label']
x_train = df.drop(['label'], axis=1)

# use logistic regression
model = LogisticRegression()
model.fit(x_train, y_train)

In [4]:
# pull all data to the driver node
sample_df = spark_df.toPandas()

# create a prediction for each user 
ids = sample_df['user_id']
x_train = sample_df.drop(['label', 'user_id', 'partition_id'], axis=1)
pred = model.predict_proba(x_train)
result_df = pd.DataFrame({'user_id': ids, 'prediction': pred[:,1]})

# display the results 
display(spark.createDataFrame(result_df))

prediction,user_id
0.381266330124349,1
0.0642218814304239,2
0.0453886169365076,3
0.6363722956203243,4
0.0571984380520558,5
0.0833672464695164,6
0.062407624115546,7
0.0298428077928553,8
0.0827002705509501,9
0.280283650512675,10


In [5]:
# define a schema for the result set, the user ID and model prediction
schema = StructType([StructField('user_id', LongType(), True),
                     StructField('prediction', DoubleType(), True)])  

# define the Pandas UDF 
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def apply_model(sample_pd):

    # run the model on the partitioned data set 
    ids = sample_df['user_id']
    x_train = sample_df.drop(['label', 'user_id', 'partition_id'], axis=1)
    pred = model.predict_proba(x_train)

    return pd.DataFrame({'user_id': ids, 'prediction': pred[:,1]})
  
# partition the data and run the UDF  
results = spark_df.groupby('partition_id').apply(apply_model)
display(results)   

user_id,prediction
1,0.381266330124349
2,0.0642218814304239
3,0.0453886169365076
4,0.6363722956203243
5,0.0571984380520558
6,0.0833672464695164
7,0.062407624115546
8,0.0298428077928553
9,0.0827002705509501
10,0.280283650512675
