In [8]:
from src import load
from src.load import read_data
from src.load import read_test
import numpy as np
import pandas as pd
import pickle
from src.kir_transformers import cv
from src.kir_models import lr
from src.kir_transformers import tfidf_new
from src.kir_models import metrics
from src.kir_models import featureEngineering
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

In [6]:
# Load the datasets through the helpers imported from src.load.
# read_data() yields four objects (names suggest a train/validation split of features and labels);
# read_test() is given the file name of the unlabeled test set.
# NOTE(review): val_X and val_y are not referenced by any cell visible in this notebook.
train_X, val_X, train_y, val_y = read_data()
test_X = read_test('test_no_label.csv')

In [7]:
# Replace missing review text with empty strings before the text is vectorized.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning in
# pandas 2.x and, with copy-on-write enabled, leaves the DataFrame unchanged.
# Only the 'review' column is filled; every other column is left as loaded.
train_X = train_X.fillna({'review': ''})
test_X = test_X.fillna({'review': ''})

In [9]:
# Build the TF-IDF vectorizer with the tfidf_new helper from kir_transformers,
# handing it the training reviews and a parameter dict with max_features set to 1000.
params = dict(max_features=1000)
tf_vectorizer = tfidf_new(train_X['review'], params)

In [11]:
# Transform the train and test reviews with the TF-IDF vectorizer (train first, then test)
train_vec, test_vec = (
    tf_vectorizer.transform(frame['review']) for frame in (train_X, test_X)
)

In [28]:
# Wrap the sparse TF-IDF matrices in sparse-backed DataFrames.
# Columns are labelled with the *string* form of their position ('0', '1', ...)
# instead of the default integers: string-named feature columns are added to
# these frames later, and scikit-learn >= 1.2 raises a TypeError when a
# DataFrame passed to fit/predict has column names of mixed types.
# NOTE(review): both frames get a default RangeIndex, so the later column
# assignments from train_X / test_X align by label only if those frames are
# also indexed 0..n-1 -- confirm against the loaders.
train_vec_df = pd.DataFrame.sparse.from_spmatrix(
    train_vec, columns=[str(i) for i in range(train_vec.shape[1])]
)
test_vec_df = pd.DataFrame.sparse.from_spmatrix(
    test_vec, columns=[str(i) for i in range(test_vec.shape[1])]
)

In [13]:
# Copy the raw date / user_id / rating columns next to the TF-IDF features.
# NOTE(review): featureEngineering is called on train_X / test_X rather than on
# these frames, and the three columns are dropped again before modelling, so
# these copies appear to be unused -- confirm before removing.
for column_name in ('date', 'user_id', 'rating'):
    train_vec_df[column_name] = train_X[column_name]
    test_vec_df[column_name] = test_X[column_name]

In [16]:
# Run the feature-engineering helper on the raw frames, then remove the columns
# that cannot be fed to the LR model. The fake_users value returned by the
# training call is passed into the test call (which receives None for labels).
train_X_new, fake_users = featureEngineering(train_X, train_y)
train_X_new.drop(columns=['date', 'user_id', 'rating'], inplace=True)
test_X_new, fake_users = featureEngineering(test_X, None, fake_users)
test_X_new.drop(columns=['date', 'user_id', 'rating'], inplace=True)

In [17]:
# Display the engineered training frame to check the new feature columns
train_X_new

Unnamed: 0,ex_id,prod_id,review,rating_indicator,previous_fake,reviews_today
0,0,0,The food at snack is a selection of popular Gr...,0,1,2
1,1,0,This little place in Soho is wonderful I had a...,0,1,1
2,2,0,ordered lunch for from Snack last Friday On ti...,0,1,2
3,3,0,This is a beautiful quaint little restaurant o...,0,1,1
4,4,0,Snack is great place for a casual sit down lun...,0,1,1
...,...,...,...,...,...,...
250869,358950,349,Made a reservation for an early dinner Saturda...,1,0,2
250870,358951,349,Emily is like Franny s Marco but with warmth a...,1,0,1
250871,358953,349,Can t say enough good things about this place ...,1,0,1
250872,358954,349,Had a great dinner here fantastic pizza the s ...,1,0,1


In [18]:
# Attach the engineered feature columns to the TF-IDF frames.
# Assignment aligns on the index labels of the two frames.
for column_name in ('rating_indicator', 'previous_fake', 'reviews_today'):
    train_vec_df[column_name] = train_X_new[column_name]
    test_vec_df[column_name] = test_X_new[column_name]

In [19]:
# Remove the raw helper columns so only TF-IDF and engineered features remain
train_vec_df.drop(columns=['date', 'user_id', 'rating'], inplace=True)
test_vec_df.drop(columns=['date', 'user_id', 'rating'], inplace=True)

In [20]:
# Display the final training matrix: TF-IDF columns followed by the engineered features
train_vec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,993,994,995,996,997,998,999,rating_indicator,previous_fake,reviews_today
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,2
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
2,0.0,0.0,0.0,0.0,0.0,0.411024,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,2
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250869,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,2
250870,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1
250871,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1
250872,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1


In [21]:
# Fit logistic regression on the TF-IDF + engineered features using the
# selected hyperparameter configuration; the fitted estimator is the cell output.
p_best_tf = dict(C=0.1, penalty='l2', verbose=10, solver='liblinear')
model = lr(p_best_tf)
model.fit(train_vec_df, train_y['label'])

[LibLinear]

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001,
                   verbose=10, warm_start=False)

In [22]:
# Keep column 1 of predict_proba (the second entry of model.classes_) as the
# score for each test row -- presumably the positive/fake label; confirm.
y_prob = model.predict_proba(test_vec_df)[:, 1]

In [23]:
# Write the predicted probabilities to predictions.csv, one value per line
np.savetxt(fname='predictions.csv', X=y_prob)