In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Google review ratings CSV; the 'User' column becomes the row index.
data = pd.read_csv(
    "google_review_ratings.csv",
    index_col="User",
)


In [3]:
# Collect every column that has fewer than 20% non-null entries, then drop
# them all in one call, so the frame is never mutated while its columns are
# being iterated and `drop_col` actually records what was removed.
min_non_null = len(data) * 0.2
drop_col = [col for col in data.columns if data[col].notna().sum() < min_non_null]
for col in drop_col:
    print("dropping column:- ", col)
data = data.drop(columns=drop_col)

dropping column:-  Unnamed: 25


In [4]:
# The dataset has no target column, so one is derived from the gym rating
# ('Category 19'): a rating below 3 is labelled 0.0 (not joined) and every
# other value (i.e. 3 or higher) is labelled 1.0 (joined the membership).
# NOTE(review): a missing rating is not "< 3", so it would also be labelled 1.0.
below_threshold = data['Category 19'] < 3
data['member'] = np.where(below_threshold, 0.0, 1.0)

In [5]:
# Row 'User 2713' is dropped because its 'Category 11' value is a weird string,
# which keeps the whole column as object dtype.
# errors='ignore' keeps the cell re-runnable: on a second run the label is
# already gone and a plain drop would raise KeyError.
data = data.drop(index=['User 2713'], errors='ignore')

# Change the datatype of 'Category 11' from object to float64; astype still
# raises if any other non-numeric value is present, so bad data is not hidden.
data['Category 11'] = data['Category 11'].astype('float64')


In [6]:
# Pairwise correlation between all columns (Pearson is pandas' default method).
correlation_data = data.corr(method='pearson')

In [7]:
# Very high correlation, since the target column is derived from 'Category 19'.
correlation_data.loc['Category 19', 'member']

0.9081093707423153

* Category 19 is very highly correlated with `member` (about 0.91).
* Keeping it as a feature would bias the predictions, because it leaks the target into the inputs.
* We will therefore drop Category 19, since the target feature (`member`) is derived from it.

In [8]:
# 'member' was derived from 'Category 19', so that column is removed from the
# frame. errors='ignore' keeps the cell re-runnable (a second run would
# otherwise raise KeyError because the column is already gone).
data = data.drop(columns=['Category 19'], errors='ignore')

# Correlation of every remaining column with the target.
data.corr()['member']

Category 1     0.113006
Category 2     0.024852
Category 3    -0.046621
Category 4    -0.128938
Category 5    -0.181834
Category 6    -0.179455
Category 7    -0.204397
Category 8    -0.189816
Category 9    -0.183263
Category 10   -0.208952
Category 11   -0.124995
Category 12   -0.011600
Category 13    0.022885
Category 14    0.065157
Category 15    0.035950
Category 16    0.039403
Category 17    0.230967
Category 18    0.318253
Category 20    0.185782
Category 21    0.107919
Category 22    0.024286
Category 23    0.015564
Category 24    0.052283
member         1.000000
Name: member, dtype: float64

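* Now none of the remaining columns shows a strong correlation with `member` (the largest is Category 18 at about 0.32). Next we will check whether we can still derive a good prediction of membership from these features.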

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [16]:
# Separate the predictors from the derived target, then hold out a test set
# (train_test_split's default split sizes, fixed seed).
target_data = data['member']
feature_data = data.drop(columns='member')
trainX, testX, trainY, testY = train_test_split(
    feature_data,
    target_data,
    random_state=42,
)

In [18]:
# Candidate classifiers, keyed by the display name used when reporting scores.
# RandomForestClassifier is seeded so that its score is reproducible across
# runs (same seed as the train/test split).
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

In [20]:
# Fit each candidate behind a most-frequent-value imputer and report its
# score on the held-out split.
for name, model in models.items():
    imputer = SimpleImputer(strategy='most_frequent')
    pipeline = make_pipeline(imputer, model)
    pipeline.fit(trainX, trainY)
    test_score = pipeline.score(testX, testY)
    print(name, test_score)

Random Forest 0.968475073313783
Logistic Regression 0.9230205278592375
Naive Bayes 0.8262463343108505


* Since we did not have a target column, we derived it from the gym ratings, i.e. Category 19.
* We then had to drop Category 19, since the target column was derived from it; otherwise it would strongly affect our predictions.
* Used `SimpleImputer` to fill in the few NaN values in 2 columns.
* Tried different models to find the best score.
* Random Forest gives the best result, with about 97% accuracy.