### Randome Forest Classifier

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import seaborn as sns
import calendar as cl
from datetime import datetime
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
#Accuracy
from sklearn import metrics
from imblearn.over_sampling import SMOTE

In [2]:
def month_converter(month):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    return months.index(month) + 1


def correlation_heatmap(train):
    correlations = train.corr()

    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show();

    
# convert the given values to binary encoding and return the dataframe

def label_encoder(dataset, names: list):
    #label encoding weekend type
    df = pd.DataFrame(dataset)
    labelencoder = LabelEncoder()
    for name in names: 
        df[name] = labelencoder.fit_transform(df[name])
        
    return df

In [3]:
#import dataset
shoping_dataset = pd.read_csv('online_shoppers_intention.csv')


#Converting the month variable to categorical
shoping_dataset['Month'] = shoping_dataset['Month'].apply(lambda x: month_converter(x))

#Make the copy of the dataset so, it might be needed at some point in time.
shop_dataset_copy = shoping_dataset.copy()

# Analyze the data set
display(shoping_dataset.head(3))

# get columns details
display(shoping_dataset.columns)

display(shoping_dataset.describe())

# so far it has been known that the columns Operating system, Browser, Region and Traffic Type has no value in data, 
# so I prefer removing it.

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,Returning_Visitor,False,False


Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,7.651987,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,3.392841,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,5.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,7.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157213,0.016813,0.05,0.0,0.0,11.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,12.0,8.0,13.0,9.0,20.0


In [4]:
shoping_dataset[['Weekend','Revenue']] = label_encoder(shoping_dataset[['Weekend','Revenue']], ['Weekend','Revenue'])
shoping_dataset[['VisitorType']] = label_encoder(shoping_dataset[['VisitorType']], ['VisitorType'])
# Here we will One hot encode the weekend because we have more to two variables to show i.e. new visitor, return visitor and others
onehotencoder = OneHotEncoder()
onehotencoded = pd.DataFrame(onehotencoder.fit_transform(shoping_dataset[['VisitorType']]).toarray(), 
                             columns=onehotencoder.get_feature_names(['VisitorType']))

shoping_dataset = shoping_dataset.drop(columns='VisitorType')
shoping_dataset = shoping_dataset.join(onehotencoded)

In [5]:
X = shoping_dataset.drop(columns=['Revenue'])
y = shoping_dataset['Revenue']


In [8]:
sm = SMOTE(random_state=27, sampling_strategy=1.0) # sampling strategy is how much ratio of minority we want to the majority
X_train_smote, y_train_smote = sm.fit_sample(X, y)


In [10]:
rf = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)

cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_iris = cross_val_score(rf, X_train_smote, y_train_smote, cv=cross_val, scoring='f1')
accuracy_iris.mean()

0.9359172413394271