In [2]:
import IPython.core.display as di
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
dataset = pd.read_csv ('adult.csv')

dataset.head(3)

Unnamed: 0,External Video Id,Golden Decision Str,Golden Policy Id,Final Golden Decision Str,Final Policy Id,Admin Decision Str,Admin Policy Id,Admin Ldap,Admin Review Date,Admin Quality Score,Original Quality Score,Post Calibration Admin Quality Score,Post Calibration Original Quality Score
0,tGKerC8PGMM,APPROVE RACY,2021,APPROVE RACY,2021,APPROVE RACY,2015,draindle,2018-08-01,1,1,1,100.00%
1,htespRHjg8Y,APPROVE,9008,APPROVE,9008,APPROVE,9008,kaabbott,2018-08-01,1,1,1,100.00%
2,J0bp1cgOUgs,APPROVE,9008,APPROVE,9008,APPROVE,9008,timothygraham,2018-07-31,1,0,1,0.00%


In [5]:
dataset = dataset[['Golden Decision Str', 'Final Policy Id', 'Post Calibration Admin Quality Score']]
dataset.head()

Unnamed: 0,Golden Decision Str,Final Policy Id,Post Calibration Admin Quality Score
0,APPROVE RACY,2021,1
1,APPROVE,9008,1
2,APPROVE,9008,1
3,STRIKE,2001,1
4,APPROVE,9008,0


In [6]:
def classify_score (number):
    if number == 0:
        return "0"
    else:
        return "1"

In [7]:
dataset ['Score_binary'] = dataset['Post Calibration Admin Quality Score'].apply(classify_score)


In [8]:
dataset.head()

Unnamed: 0,Golden Decision Str,Final Policy Id,Post Calibration Admin Quality Score,Score_binary
0,APPROVE RACY,2021,1,1
1,APPROVE,9008,1,1
2,APPROVE,9008,1,1
3,STRIKE,2001,1,1
4,APPROVE,9008,0,0


In [9]:
# Encoding the categorical variables
dataset = pd.get_dummies(dataset, columns= ["Golden Decision Str","Final Policy Id"], prefix= ["str","id"])

In [10]:
dataset.head(3)

Unnamed: 0,Post Calibration Admin Quality Score,Score_binary,str_APPROVE,str_APPROVE PARENTAL GUIDANCE,str_APPROVE RACY,str_FORCE PRIVATE VIDEOS,str_REJECT,str_STRIKE,str_TERMINATE USER,id_1002,...,id_5012,id_6001,id_6003,id_6048,id_6054,id_7001,id_7003,id_7008,id_9008,id_20121
0,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# creating matrix of features to identify the dependent factor vs independent factors (parameters)
# Avoid the dummy variable trap is done by dropping 1 dummy variable from each category. Some libraries do it for you 
X = dataset.iloc[:, [2,3,4,5,6,7]].values  
y = dataset.iloc[:, 1].values

In [32]:
y

array(['1', '1', '1', ..., '1', '0', '0'], dtype=object)

In [12]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0) # you can ommit random_state. It 
# keeps the same random sample for all sampling trials OR you can choose different value.



In [13]:
# Feature Scaling. This for bringing large and small values of different parameters on the same scale.Not needed for this dataset
# Here we will use Standardisation Scale.
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train) # X_train needs for fit and transform
X_test = sc_X.transform(X_test)       # X_test needs for only transform

# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)
# y_test = sc_y.transform(y_test)"""

'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train) # X_train needs for fit and transform\nX_test = sc_X.transform(X_test)       # X_test needs for only transform\n\n# sc_y = StandardScaler()\n# y_train = sc_y.fit_transform(y_train)\n# y_test = sc_y.transform(y_test)'

In [14]:
# A) Fitting Logistic Regression to the Training set
# Creating logestic regression class (classifier)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 622, 2305],
       [ 468, 8077]], dtype=int64)

In [16]:
# Applying k-Fold Cross Validation (evaluating the model performance)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = classifier, X = X_train, y = y_train, cv = 10)
print (accuracies)
print (accuracies.mean ())
print (accuracies.std ())

[0.75522648 0.76045296 0.75087108 0.75348432 0.7576286  0.75675676
 0.7489102  0.75850044 0.7504363  0.7399651 ]
0.7532232228904059
0.0056900975044570715


In [17]:
# B) Fitting KNN to the Training set
# Creating KNN class (classifier)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1640, 1287],
       [2892, 5653]], dtype=int64)

In [19]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = classifier, X = X_train, y = y_train, cv = 10)
print (accuracies)
print (accuracies.mean ())
print (accuracies.std ())

[0.74738676 0.72560976 0.75       0.62108014 0.74542284 0.75588492
 0.74367916 0.73496077 0.7513089  0.72774869]
0.7303081936300411
0.03763365408884896


In [20]:
# C) Fitting SVC to the Training set. The assumption is that the data is linearly separable.
# Creating SVC class (classifier)
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 622, 2305],
       [ 468, 8077]], dtype=int64)

In [22]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = classifier, X = X_train, y = y_train, cv = 10)
print (accuracies)
print (accuracies.mean ())
print (accuracies.std ())

[0.75522648 0.76045296 0.75087108 0.75348432 0.7576286  0.75675676
 0.7489102  0.75850044 0.7504363  0.7399651 ]
0.7532232228904059
0.0056900975044570715


In [23]:
# D) Fitting kernel SVC to the Training set. The assumption is that the data is NOT-linearly separable.
# Creating kernel SVC class (classifier)
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 622, 2305],
       [ 468, 8077]], dtype=int64)

In [25]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = classifier, X = X_train, y = y_train, cv = 10)
print (accuracies)
print (accuracies.mean ())
print (accuracies.std ())

[0.75522648 0.76045296 0.75087108 0.75348432 0.7576286  0.75675676
 0.7489102  0.75850044 0.7504363  0.7399651 ]
0.7532232228904059
0.0056900975044570715


In [26]:
# E) Fitting Naive Bayes to the Training set. The assumption is that the parameters are independant.
# Creating Naive Bayes class (classifier)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB() # naive bayes has no arguements
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [27]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 911, 2016],
       [ 950, 7595]], dtype=int64)

In [28]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = classifier, X = X_train, y = y_train, cv = 10)
print (accuracies)
print (accuracies.mean ())
print (accuracies.std ())

[0.73954704 0.74041812 0.73519164 0.73519164 0.74542284 0.72711421
 0.75065388 0.74106364 0.7478185  0.73036649]
0.7392788000487549
0.007109430854283707


In [None]:
# Applying Grid Search to find the best model and the best parameters. Here is just the code. Excuting the code on many
# independent variables is time and computation comsuming. We will do it after dimentionality reduction.
"""from sklearn.model_selection import GridSearchCV
parameters = [{"C": [1000, 1500, 2000, 3000], 'kernel': ['linear']},
              {"C": [1000, 1500, 2000, 3000], 'kernel': ['rbf'], 'gamma': [0.05, 0.01, 0.02, 0.03, 0.04]}] #if you get 0.5 as the best
                                                    # then exchange 0.001 and 0.0001 by 0.2, 0.3, 0.4, 0.5,0.6,0.7,0.8,and 0.9
grid_search = GridSearchCV (estimator = classifier,
                            param_grid = parameters,
                            scoring = 'accuracy',
                            cv = 10,
                            n_jobs = -1)
grid_search = grid_search.fit (X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_ """

In [None]:
# print (best_accuracy)
# print (best_parameters)

In [30]:
import statsmodels.api as sm

In [31]:
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary())

TypeError: '>=' not supported between instances of 'str' and 'int'