In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#plt.style.use("fivethirtyeight")

### data set imports

In [2]:
#from sklearn import svm, datasets      #this imports the datasets in scikit learn

In [3]:
#iris_df = sns.load_dataset('iris') #using sklearn import instead

In [4]:
titanic_df = sns.load_dataset('titanic')

In [5]:
flights_df = sns.load_dataset('flights')

In [6]:
exercise_df = sns.load_dataset('exercise')

In [7]:
planets_df = sns.load_dataset('planets')

In [8]:
tips_df = sns.load_dataset('tips')

In [9]:
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
tips_df.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

### create dummy variables

In [11]:
new_df = pd.get_dummies(tips_df, drop_first=True)

In [12]:
new_df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.50,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1
5,25.29,4.71,4,0,1,0,0,1,1
6,8.77,2.00,2,0,1,0,0,1,1
7,26.88,3.12,4,0,1,0,0,1,1
8,15.04,1.96,2,0,1,0,0,1,1
9,14.78,3.23,2,0,1,0,0,1,1


### define X and y

In [13]:
feature_cols = new_df.drop('tip', axis=1).copy()
X = feature_cols
y = new_df['tip']

### train test split (TTS)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## modelling

- create your model
- cross val score (note, this can be performed for all models, though we only show it for LR below)
- grid search cv (notes this can be performed for all models, though we only show it for LR below)

### linear regression

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
lr = LinearRegression() #instantiates the model
lr.fit(X_train, y_train) #fit the model on the training data
preds = lr.predict(X_test) #predicts the y's

In [18]:
from sklearn.metrics import mean_squared_error

In [19]:
y_true = y_test
y_pred = preds
mean_squared_error(y_true, y_pred) #the smaller the MSE the better 

1.567333511019207

In [20]:
lr.score(X_test, y_test) #this gives you R2 or model fit (the larger the better)

0.3843680849798467

In [21]:
print(lr.coef_)
lr.intercept_

[ 0.07674251  0.28146603  0.05647344  0.13380634  0.14587461 -0.23063252
 -0.07113096  0.06906499]


0.6202445961332326

### cross val score (CVS)

In [40]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [41]:
X = X_train
y = y_train
print(cross_val_score(lr, X, y, cv=10))

#this gives the score(R2) for the kfold where cv=10
#take the avg of the cross val scores and this will tell you how well you will do

[0.2297965  0.4751062  0.34282206 0.34097773 0.38575596 0.16506057
 0.39566901 0.33120838 0.13211282 0.12308324]


In [43]:
kf = KFold(n_splits=10, shuffle=True)

X = X_train
y = y_train
print(cross_val_score(lr, X, y, cv=kf))

#this gives the score(R2) for the kfold where cv=the kfold
#take the avg of the cross val scores and this will tell you how well you will do

[0.3083708  0.50954473 0.40740144 0.3843264  0.29668046 0.13333085
 0.10351469 0.40284379 0.2297931  0.25329899]


### grid search cv (GSCV)

In [45]:
from sklearn.model_selection import GridSearchCV

In [416]:
params = {'fit_intercept': (True, False), 'normalize': (True, False)}

In [417]:
gscv = GridSearchCV(lr, params) #instantiates the grid search

In [339]:
gscv.fit(X_train, y_train) #fits and cycles through model and the params that we passed in

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': (True, False), 'normalize': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [340]:
gscv.best_estimator_ #this gives the best model paramaters for an optimal outcome or best model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [341]:
lr = gscv.best_estimator_ #instantiates the best model
lr.fit(X_train, y_train) #fit the model on the training data
preds = lr.predict(X_test) #predicts the y's

In [342]:
y_true = y_test
y_pred = preds
mean_squared_error(y_true, y_pred) #the smaller the MSE the better 

0.8018779635581809

In [343]:
lr.score(X_test, y_test) #this gives you R2 or model fit (the larger the better)

0.5030541469855027

In [344]:
print(lr.coef_) 
lr.intercept_

#note the normalize=True is like a z score, which subtracts the mean from every data point and divides it by the variance

#and of course intercept=False which means that there is no required intercept, is zero

[ 0.09777709  0.16516675  0.01388336  0.23793707  0.11526313 -0.09760914
 -0.19800753 -0.04252873]


0.5801325018588104

### Logistic Reression

In [33]:
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [34]:
new_df2 = pd.get_dummies(titanic_df, drop_first=True)

In [35]:
new_df2

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,...,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes
0,0,3,22.0,1,0,7.2500,True,False,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,38.0,1,0,71.2833,False,False,0,0,...,1,0,1,0,0,0,0,0,0,1
2,1,3,26.0,0,0,7.9250,False,True,0,0,...,1,0,0,0,0,0,0,0,1,1
3,1,1,35.0,1,0,53.1000,False,False,0,0,...,1,0,1,0,0,0,0,0,1,1
4,0,3,35.0,0,0,8.0500,True,True,1,0,...,0,0,0,0,0,0,0,0,1,0
5,0,3,,0,0,8.4583,True,True,1,1,...,0,0,0,0,0,0,0,1,0,0
6,0,1,54.0,0,0,51.8625,True,True,1,0,...,0,0,0,0,1,0,0,0,1,0
7,0,3,2.0,3,1,21.0750,False,False,1,0,...,0,0,0,0,0,0,0,0,1,0
8,1,3,27.0,0,2,11.1333,False,False,0,0,...,1,0,0,0,0,0,0,0,1,1
9,1,2,14.0,1,0,30.0708,False,False,0,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
new_df2 = new_df2[['survived', 'age', 'adult_male']].dropna().copy()

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
lgr = LogisticRegression() #instantiate

feature_cols = new_df2[['age', 'adult_male']].copy()
X = feature_cols
y = new_df2['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) #TTS

lgr.fit(X_train, y_train) #fit
preds = lgr.predict(X_test) # predict

In [39]:
skf = StratifiedKFold(n_splits=10, shuffle=True)

X = X_train
y = y_train
print(cross_val_score(lr, X, y, cv=skf))

#stratified splits the categorical variables or values evenly, across the n_splits, so as to avoid imbalance
#this gives the score(R2) for the kfold where cv=the startifiedkfold
#take the avg of the cross val scores and this will tell you how well you will do

[0.28818055 0.56107395 0.29114333 0.13318732 0.2457592  0.4364887
 0.45497226 0.22680775 0.1156404  0.34680031]


# munur

In [50]:
from sklearn.metrics import roc_auc_score

params = {'fit_intercept': (True, False)}
grid_clf_auc = GridSearchCV(lgr, param_grid = params, scoring = 'roc_auc')
grid_clf_auc.fit(X_train, y_train)
y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) 

print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))

#ask munur what auc is...???

Test set AUC:  0.8044276296074319


In [351]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

from sklearn.metrics import classification_report
pp.pprint(classification_report(y_test, preds))

#precision:
#recall:
#f1-score:
#support\n:    

('             precision    recall  f1-score   support\n'
 '\n'
 '          0       0.82      0.77      0.80       145\n'
 '          1       0.67      0.74      0.70        91\n'
 '\n'
 'avg / total       0.76      0.76      0.76       236\n')


## clustering

### K nearest neighbors (KNN)

In [352]:
from sklearn.neighbors import KNeighborsClassifier

In [353]:
from sklearn import datasets
iris_df = datasets.load_iris()

In [354]:
X = iris_df.data
y = iris_df.target

In [355]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) #TTS

In [356]:
knn = KNeighborsClassifier(n_neighbors=8) #instantiates

In [357]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [358]:
knn.predict(X_test)

array([2, 0, 1, 0, 0, 0, 1, 2, 2, 1, 2, 2, 1, 0, 1, 1, 2, 0, 1, 1, 0, 0,
       2, 2, 1, 1, 1, 1, 2, 0, 2, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 2, 0, 2,
       1, 0, 0, 0, 2, 0])

In [359]:
knn.score(X_test, y_test) #this gives you accuracy

0.98

### K means (KM)

In [360]:
#dataset needs to be categorical - go with iris

In [361]:
from sklearn.cluster import KMeans
#from sklearn import cluster, datasets, preprocessing, metrics

In [362]:
X = iris_df.data
y = iris_df.target

In [363]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) #TTS

In [364]:
k = 2 # set the number of clusters
kmeans = cluster.KMeans(n_clusters=k, n_init=10) #n_init is the number of iterartions 
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [365]:
kmeans.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [366]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_

In [367]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [368]:
centroids

array([[5.00566038, 3.36037736, 1.56226415, 0.28867925],
       [6.30103093, 2.88659794, 4.95876289, 1.69587629]])

In [369]:
inertia

152.36870647733906

In [370]:
metrics.silhouette_score(X, labels, metric='euclidean')

0.6808136202713508

## decision trees

### decision tree regressor (DTR - used for continuous data)

In [371]:
from sklearn.tree import DecisionTreeRegressor

In [372]:
flights_df.head()

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121


In [373]:
new_df3 = pd.get_dummies(flights_df, drop_first=True)

In [374]:
new_df3.head()

Unnamed: 0,year,passengers,month_February,month_March,month_April,month_May,month_June,month_July,month_August,month_September,month_October,month_November,month_December
0,1949,112,0,0,0,0,0,0,0,0,0,0,0
1,1949,118,1,0,0,0,0,0,0,0,0,0,0
2,1949,132,0,1,0,0,0,0,0,0,0,0,0
3,1949,129,0,0,1,0,0,0,0,0,0,0,0
4,1949,121,0,0,0,1,0,0,0,0,0,0,0


In [375]:
X = new_df3.drop('passengers', axis=1).copy()
y = new_df3['passengers']

In [376]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) #TTS

In [377]:
dtr = DecisionTreeRegressor(max_depth=5)

In [378]:
dtr.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [379]:
preds = dtr.predict(X_test)

In [380]:
preds

array([411.42857143, 272.77777778, 334.375     , 164.75      ,
       151.33333333, 164.75      , 185.6       , 222.8125    ,
       133.14285714, 334.375     , 151.33333333, 465.        ,
       334.375     , 185.6       , 334.375     , 164.75      ,
       334.375     , 133.14285714, 222.8125    , 334.375     ,
       334.375     , 222.8125    , 151.33333333, 164.75      ,
       164.75      , 185.6       , 133.14285714, 151.33333333,
       185.6       , 334.375     , 411.42857143, 491.        ,
       185.6       , 164.75      , 222.8125    , 334.375     ,
       334.375     , 334.375     , 441.75      , 334.375     ,
       334.375     , 133.14285714, 185.6       , 505.        ,
       164.75      , 411.42857143, 151.33333333, 334.375     ])

In [381]:
dtr.score(X_test, y_test) #this gives R2

0.9136782710665355

In [382]:
from sklearn.metrics import mean_squared_error

In [383]:
y_true = y_test
y_pred = preds
mean_squared_error(y_true, y_pred) #the smaller the MSE the better

1178.6077757536273

In [384]:
pd.DataFrame({'feature':X.columns, 'importance':dtr.feature_importances_}).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,year,0.903041
6,month_July,0.045442
7,month_August,0.044251
5,month_June,0.005872
10,month_November,0.001394
1,month_February,0.0
2,month_March,0.0
3,month_April,0.0
4,month_May,0.0
8,month_September,0.0


### decision tree classifier (DTC - used for categorical data)

In [385]:
from sklearn.tree import DecisionTreeClassifier

In [386]:
new_df4 = pd.get_dummies(exercise_df, drop_first=True)

In [387]:
new_df4.head()

Unnamed: 0.1,Unnamed: 0,id,pulse,diet_low fat,time_15 min,time_30 min,kind_walking,kind_running
0,0,1,85,1,0,0,0,0
1,1,1,85,1,1,0,0,0
2,2,1,88,1,0,1,0,0
3,3,2,90,1,0,0,0,0
4,4,2,92,1,1,0,0,0


In [388]:
X = new_df4.drop(['Unnamed: 0', 'id', 'diet_low fat'], axis=1).copy()
y = new_df4['diet_low fat']

In [389]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) #TTS

In [390]:
dtc = DecisionTreeClassifier(max_depth=5)

In [391]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [392]:
preds = dtc.predict(X_test)

In [393]:
preds

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0], dtype=uint8)

In [394]:
dtc.score(X_test, y_test) #this gives accuracy

0.4

In [395]:
pd.DataFrame({'feature':X.columns, 'importance':dtc.feature_importances_})

Unnamed: 0,feature,importance
0,pulse,0.888937
1,time_15 min,0.111063
2,time_30 min,0.0
3,kind_walking,0.0
4,kind_running,0.0


In [396]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

from sklearn.metrics import classification_report
pp.pprint(classification_report(y_test, preds))

#precision:
#recall:
#f1-score:
#support\n:    

('             precision    recall  f1-score   support\n'
 '\n'
 '          0       0.39      0.69      0.50        13\n'
 '          1       0.43      0.18      0.25        17\n'
 '\n'
 'avg / total       0.41      0.40      0.36        30\n')


# munur

### random forest

In [None]:
rfc = RandomForestClassifier()
scores = cross_val_score(rfc, X, y, cv=50)
print(np.average(scores))
# print((scores))

rfc.fit(X,y)
pp.pprint(classification_report(rfc.predict(X), y))