In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import  BernoulliNB
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
gender = pd.read_csv('Transformed Data Set - Sheet1.csv')
gender.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


### Grouping the categories

In [3]:
gender_train, gender_test = train_test_split(gender, test_size = 0.3, random_state=42)


In [4]:
col_to_feature = gender_train.columns[1:-1]
col_to_feature.tolist()

per_to_use = [0.1, 0.15, 0.2]
new_dict = dict(zip(col_to_feature, per_to_use))

In [5]:
def find_non_rare_labels(df, variable, tolerance):
    temp = df[variable].value_counts(normalize=True)
    labels = [i for i in temp.loc[temp >tolerance ].index.values]
    return labels

In [6]:
def rare_encoding(train,test, variable,tolerance):
    frequent_cat = find_non_rare_labels(train, variable, tolerance)
    train[variable] = np.where(train[variable].isin (frequent_cat), train[variable], 'Other')
    test[variable] = np.where(test[variable].isin (frequent_cat), test[variable], 'Other')
    
    return train, test

In [7]:
for col in new_dict:
    gender_train, gender_test= rare_encoding(gender_train, gender_test , col,new_dict[col])

In [8]:
def find_category_mappings(df, variable, target):

    tmp = pd.DataFrame(df.groupby([variable])[target].mean())
    
    tmp['non-target'] = 1 - tmp[target]
    
    tmp['ratio'] = np.log(tmp[target] / tmp['non-target'])
    tmp['ratio'] = np.where(tmp['ratio'].isin([np.inf , -np.inf]), tmp['ratio'].dropna().sample(random_state=42), tmp['ratio'])

    label_dict = tmp['ratio'].to_dict()
    return label_dict

In [9]:
def integer_encode(df, variable, ordinal_mapping):

    df[variable] = df[variable].map(ordinal_mapping)
#     test[variable] = test[variable].map(ordinal_mapping)

In [10]:
gender_train['Gender'] = np.where(gender_train['Gender'] == 'F', 1, 0)
gender_test['Gender'] = np.where(gender_test['Gender'] == 'F', 1, 0)

In [11]:
new_col = gender.columns[:-1]
for col in new_col:
    label_dict = find_category_mappings(gender_train, col, 'Gender')
    integer_encode(gender_train, col, label_dict)

In [12]:
gender_train

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
6,-0.068993,1.203973,0.693147,0.087011,1
28,-0.068993,1.203973,-0.405465,0.087011,1
4,-0.068993,-0.287682,-0.287682,0.087011,1
48,-0.068993,-0.287682,-0.405465,-0.154151,0
36,0.693147,-0.287682,-0.405465,0.087011,0
19,-0.068993,1.203973,-0.405465,-0.154151,1
56,-0.287682,-0.587787,-0.154151,-0.405465,0
59,-0.068993,1.203973,-0.287682,-0.154151,0
58,-0.287682,-0.587787,-0.287682,0.087011,0
50,-0.068993,-0.587787,-0.405465,0.087011,0


In [13]:
for col in new_col:
    label_dict = find_category_mappings(gender_test, col, 'Gender')
    integer_encode(gender_test, col, label_dict)

In [14]:
gender_test

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
54,-0.510826,-0.693147,-1.098612,0.0,0
62,-0.510826,-0.510826,-1.098612,0.223144,0
0,-0.510826,1.386294,0.0,0.287682,1
45,-0.510826,-0.510826,-1.098612,0.223144,0
5,1.94591,-0.510826,-1.098612,0.0,1
63,-1.098612,-0.510826,-1.098612,0.0,0
16,-0.510826,1.098612,1.94591,0.287682,1
12,1.94591,1.098612,1.94591,0.287682,1
65,-0.510826,-0.693147,-1.098612,0.223144,0
30,1.94591,1.386294,1.94591,0.223144,1


In [15]:
# gender_test['Favorite Beverage'] = np.where(gender_test['Favorite Beverage'].isin([-np.inf, np.inf]), gender_test['Favorite Beverage'].dropna().sample(), gender_test['Favorite Beverage'] )

In [16]:
# gender_test[gender_test['Favorite Beverage'].isin([-np.inf, np.inf])]

In [17]:
def X_y(df, col):
    X = df.drop(col, axis=1)
    y = df[col]
    return X, y

In [18]:
X_train, y_train = X_y(gender_train, 'Gender')
X_test, y_test = X_y(gender_test, 'Gender')

In [19]:
rf = RandomForestClassifier(random_state=42)
los = LogisticRegression()
sv  = SVC()
nb = BernoulliNB()
model_col = [rf, los, sv,nb]

In [26]:
train= []
test = []
col_model = ['random_forest', 'Logistic_Regression', 'Suport_vector', 'Naivebayes']
for model, col in zip(model_col, col_model):
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    train.append(train_score)
    test.append(test_score)
#     annot_kws = {"ha": 'right',"va": 'center'}
#     sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, annot_kws=annot_kws)
#     plt.title(f'{col} , train_score:{train_score}, test_score: {test_score} ')
#     plt.show()
    print(f'{col} , train_score:{train_score}, test_score: {test_score} ')
#     print('\n')
    print(confusion_matrix(y_test, y_pred))
    print('\n\n\n')

random_forest , train_score:0.9130434782608695, test_score: 0.7 
[[6 3]
 [3 8]]




Logistic_Regression , train_score:0.717391304347826, test_score: 0.7 
[[6 3]
 [3 8]]




Suport_vector , train_score:0.6956521739130435, test_score: 0.85 
[[ 6  3]
 [ 0 11]]




Naivebayes , train_score:0.717391304347826, test_score: 0.7 
[[6 3]
 [3 8]]






In [36]:
pipe = Pipeline([('classifier' , RandomForestClassifier())])
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20)},
#     'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : [1, 2, 3]}
]

In [37]:
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

In [38]:
best_model = clf.fit(X_train, y_train)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:    4.8s finished


In [39]:
best_model.best_estimator_.get_params()['classifier']

LogisticRegression(C=29.763514416313132, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
best_model.score(X_test, y_test)

0.75

In [41]:
best_model.score(X_train, y_train)

0.6956521739130435

In [46]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=3)
grid_search.get_params()

{'cv': 3,
 'error_score': 'raise-deprecating',
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'auto_deprecated',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='rbf', max_iter=-1, probability=False, random_state=None,
     shrinking=True, tol=0.001, verbose=False),
 'iid': 'warn',
 'n_jobs': None,
 'param_grid': {'C': [0.001, 0.01, 0.1, 1, 10],
  'gamma': [0.001, 0.01, 0.1, 1]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [47]:
grid_search.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'gamma': [0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [48]:
grid_search.score(X_train,y_train)

0.6739130434782609

In [49]:
grid_search.score(X_test,y_test)

0.7

In [None]:
new = ['Cool', 'Rock', 'Wine', 'Coca Cola/Pepsi']