<a href="https://colab.research.google.com/github/YasmineJiang/codespace/blob/main/Web_Analytics_Model_Deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline

class UserPredictor():
    """Predict a binary per-user label from user attributes and browsing logs.

    Features built for every user:
      * ``age`` and ``past_purchase_amt`` taken straight from the users table,
      * one-hot encoded ``badge`` columns (``badge_<category>``),
      * ``avg``: mean ``seconds`` the user spent on ``/laptop.html``
        (0 when the user never visited that page).

    The features are standardized and fed to a degree-2 polynomial
    logistic-regression pipeline stored in ``self.model``.
    """

    # Page whose mean dwell time becomes the 'avg' feature.
    _LAPTOP_URL = '/laptop.html'

    def __init__(self):
        self.model = Pipeline([
            ("pf", PolynomialFeatures(degree=2, include_bias=False)),
            ("lr", LogisticRegression()),
        ])
        # Preprocessing state learned in fit() and reused in predict(), so
        # that test data is encoded and scaled exactly like training data.
        self._encoder = None
        self._scaler = None

    @staticmethod
    def _laptop_avg(users, logs):
        """Return, per row of ``users``, the mean seconds spent on the laptop page.

        Users without any laptop-page visit (or without any log at all) get 0.
        ``logs`` is only read, never modified.
        """
        laptop_logs = logs.loc[logs['url'] == UserPredictor._LAPTOP_URL]
        avg_by_user = laptop_logs.groupby('user_id')['seconds'].mean()
        # map() looks each user_id up in the per-user means; misses become NaN.
        return users['user_id'].map(avg_by_user).fillna(0).astype(float)

    def _raw_features(self, users, logs, encoder, fit_encoder):
        """Build the unscaled feature frame (one row per row of ``users``).

        ``encoder`` is fitted on the badge column when ``fit_encoder`` is
        true, otherwise it is only applied. The result has a fresh
        RangeIndex and the column order
        ``age, past_purchase_amt, badge_*, avg``.
        """
        # Work on a positional index so every piece below lines up row by
        # row even when the caller's frame carries a non-default index.
        users = users.reset_index(drop=True)

        badge = users[['badge']]
        encoded = encoder.fit_transform(badge) if fit_encoder else encoder.transform(badge)
        badge_df = pd.DataFrame(
            encoded.toarray(),
            columns=encoder.get_feature_names_out(),
            index=users.index,
        )

        features = pd.concat([users[['age', 'past_purchase_amt']], badge_df], axis=1)
        features['avg'] = self._laptop_avg(users, logs)
        return features

    def data_manipulation(self, users, logs):
        """Return standardized features for ``users``, fitted on this data alone.

        A new one-hot encoder and a new scaler are fitted on the given data
        each time this is called; nothing is stored on the instance. The
        returned DataFrame has a default RangeIndex (rows in the order of
        ``users``) and columns ``age, past_purchase_amt, badge_*, avg``.
        Neither ``users`` nor ``logs`` is modified.
        """
        features = self._raw_features(users, logs, OneHotEncoder(), fit_encoder=True)
        scaled = StandardScaler().fit_transform(features)
        return pd.DataFrame(scaled, columns=list(features.columns))

    def fit(self, train_users, train_logs, train_y):
        """Fit the preprocessing steps and the model on the full training set.

        ``train_y`` is either a DataFrame with a ``'y'`` column or a Series
        of labels; its rows are matched to ``train_users`` by position.

        Raises:
            ValueError: if the number of labels differs from the number of users.
        """
        # Unknown badges seen later at predict time are encoded as all-zero
        # instead of raising.
        encoder = OneHotEncoder(handle_unknown='ignore')
        features = self._raw_features(train_users, train_logs, encoder, fit_encoder=True)

        scaler = StandardScaler().fit(features)
        train_x = pd.DataFrame(scaler.transform(features), columns=list(features.columns))

        labels = train_y['y'] if isinstance(train_y, pd.DataFrame) else pd.Series(train_y)
        labels = labels.to_numpy()
        if len(labels) != len(train_x):
            raise ValueError(
                f"train_y has {len(labels)} rows but train_users has {len(train_x)}"
            )

        # Train on every row: holding part of the data out without ever
        # scoring it would only shrink the training set.
        self.model.fit(train_x, labels)

        # Publish the preprocessing state only once fitting succeeded.
        self._encoder = encoder
        self._scaler = scaler

    def predict(self, test_users, test_logs):
        """Return the model's predicted label for every row of ``test_users``.

        The encoder and scaler learned in ``fit`` are reused so that the
        test features have the same columns and scale as the training ones.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        if self._encoder is None or self._scaler is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("UserPredictor.predict() was called before fit()")

        features = self._raw_features(test_users, test_logs, self._encoder, fit_encoder=False)
        test_x = pd.DataFrame(
            self._scaler.transform(features), columns=list(features.columns)
        )
        return self.model.predict(test_x)