In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("/Users/bianca/Documents/GitHub/DAT-10-14/class material/Unit3/data/housing.csv")

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.linear_model import Lasso, Ridge

In [4]:
X = df.iloc[:,:-1]
y = df['PRICE']

In [13]:
#need to standardize!
X = (X-X.mean())/X.std()

In [14]:
#almost always useful to either use Lasso or Ridge vs. a regular regression

In [15]:
lasso = Lasso()

In [16]:
lasso.fit(X,y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [17]:
#in terms of internals (coefs, etc) - lasso + ridge return similar to linreg

In [18]:
lasso.coef_

array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        2.71542789, -0.        , -0.        , -0.        , -0.        ,
       -1.34428304,  0.18036988, -3.54677609])

In [12]:
#if a coef set to zero, functionally removed from dataset (anything multiplied by zero, not important)
#note above how some columns have '0' coefficients (typical for lasso)

In [20]:
l1_term = np.sum(np.abs(lasso.coef_))

In [21]:
l1_term

7.786856908757779

In [22]:
alpha = 10

In [23]:
l1_term_2 = alpha*np.sum(np.abs(lasso.coef_))

In [24]:
l1_term_2

77.86856908757778

In [25]:
#one little tweak can be made to alpha 
#training scores tend to be higher than test scores

In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
alphas = np.logspace(-4,4,9)

In [33]:
cv_scores = []

for alpha in alphas:
    lasso.set_params(alpha = alpha)
    scores = cross_val_score(estimator = lasso, X=X, y=y, cv=10)
    cv_scores.append((np.mean(scores), alpha))

In [34]:
cv_scores

[(0.20262012278999347, 0.0001),
 (0.20343535169466956, 0.001),
 (0.21144430759385185, 0.01),
 (0.2407890782467927, 0.1),
 (0.1807548507575551, 1.0),
 (-1.2860830508551744, 10.0),
 (-1.2860830508551744, 100.0),
 (-1.2860830508551744, 1000.0),
 (-1.2860830508551744, 10000.0)]

In [35]:
lasso.get_params()

{'alpha': 10000.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [36]:
# 'tol' - after evvery iteration, compares loss function from this round to previous rown

In [37]:
from sklearn.ensemble import RandomForestRegressor

In [38]:
rf = RandomForestRegressor()

In [49]:
rf.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [40]:
rf.feature_importances_

array([0.0435009 , 0.0009928 , 0.00627515, 0.0010515 , 0.02077635,
       0.44122283, 0.01395953, 0.06452823, 0.00195213, 0.01328215,
       0.01906685, 0.01029556, 0.36309602])

In [41]:
np.sum(rf.feature_importances_)

1.0

In [42]:
#according to how random forest fit dataset, the relative importance of every feature or column

In [43]:
rf.feature_importances_

In [44]:
#let's turn into a dataframe

In [48]:
feats = pd.DataFrame({
    'Features' : X.columns, 
    'Importance' : rf.feature_importances_
}).sort_values(by='Importance', ascending = False)

In [58]:
feats

Unnamed: 0,Features,Importance
5,RM,0.441223
12,LSTAT,0.363096
7,DIS,0.064528
0,CRIM,0.043501
4,NOX,0.020776
10,PTRATIO,0.019067
6,AGE,0.01396
9,TAX,0.013282
11,B,0.010296
2,INDUS,0.006275


In [51]:
#notice that the numbers are all positive 

In [52]:
rf.score(X,y)

0.9773890528048839

In [53]:
#numbers do not considered coefs but gives indication of how senstive the columns are

In [54]:
#pearsons correlation - linear association 

In [55]:
#spearmans correlation - measures rank correlation 

In [56]:
def signmoid(x):
    return 1/(1+np.exp(-x))

In [57]:
signmoid(0)

0.5

In [59]:
#odds that a particular event will happen

In [60]:
train = pd.read_csv("/Users/bianca/Documents/GitHub/DAT-10-14/class material/Unit3/data/titanic/train.csv")
test = pd.read_csv("/Users/bianca/Documents/GitHub/DAT-10-14/class material/Unit3/data/titanic/test.csv")

In [63]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [67]:
X_train = train.loc[:,['Sex','Pclass']]
y_train = train.loc[:,'Survived']
X_test = test.loc[:,['Sex','Pclass']]

In [68]:
X_train = pd.get_dummies(X_train, columns=['Sex', 'Pclass'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['Sex', 'Pclass'], drop_first=True)

In [71]:
#drop column because wrapped in the interceptor

In [72]:
#do it for linear models 

In [73]:
#logistic regression

In [74]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver = 'liblinear')
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [75]:
#C is the equivalent of alpha (ridge/lasso)
#with logistic regressions, coef are much smaller 

In [76]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
logreg.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [79]:
#this is differnt than linear model
#what it tells you... first number is chance that "0", second number is chance that "1"
logreg.predict_proba(X_test)

array([[0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.75093089, 0.24906911],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.75093089, 0.24906911],
       [0.41462909, 0.58537091],
       [0.89503473, 0.10496527],
       [0.89503473, 0.10496527],
       [0.60521334, 0.39478666],
       [0.11295975, 0.88704025],
       [0.75093089, 0.24906911],
       [0.11295975, 0.88704025],
       [0.20028547, 0.79971453],
       [0.75093089, 0.24906911],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.41462909, 0.58537091],
       [0.60521334, 0.39478666],
       [0.89503473, 0.10496527],
       [0.11295975, 0.88704025],
       [0.60521334, 0.39478666],
       [0.11295975, 0.88704025],
       [0.89503473, 0.10496527],
       [0.11295975, 0.88704025],
       [0.89503473, 0.10496527],
       [0.60521334, 0.39478666],
       [0.89503473, 0.10496527],
       [0.

In [82]:
logreg.predict_proba(X_test)[:,1]

array([0.10496527, 0.58537091, 0.24906911, 0.10496527, 0.58537091,
       0.10496527, 0.58537091, 0.24906911, 0.58537091, 0.10496527,
       0.10496527, 0.39478666, 0.88704025, 0.24906911, 0.88704025,
       0.79971453, 0.24906911, 0.10496527, 0.58537091, 0.58537091,
       0.39478666, 0.10496527, 0.88704025, 0.39478666, 0.88704025,
       0.10496527, 0.88704025, 0.10496527, 0.39478666, 0.10496527,
       0.24906911, 0.24906911, 0.58537091, 0.58537091, 0.39478666,
       0.10496527, 0.58537091, 0.58537091, 0.10496527, 0.10496527,
       0.10496527, 0.39478666, 0.10496527, 0.79971453, 0.88704025,
       0.10496527, 0.39478666, 0.10496527, 0.88704025, 0.58537091,
       0.39478666, 0.24906911, 0.79971453, 0.88704025, 0.24906911,
       0.10496527, 0.10496527, 0.10496527, 0.10496527, 0.88704025,
       0.10496527, 0.24906911, 0.10496527, 0.58537091, 0.39478666,
       0.79971453, 0.58537091, 0.39478666, 0.39478666, 0.88704025,
       0.58537091, 0.10496527, 0.58537091, 0.39478666, 0.88704

In [84]:
logreg.intercept_

array([2.06085883])

In [85]:
logreg.coef_

array([[-2.48809433, -0.67634771, -1.7159975 ]])

In [86]:
coeffs = pd.DataFrame({
    'Variable':X_train.columns,
    'Weight':logreg.coef_[0]
})

In [87]:
coeffs

Unnamed: 0,Variable,Weight
0,Sex_male,-2.488094
1,Pclass_2,-0.676348
2,Pclass_3,-1.715997


In [88]:
output = X_train.dot(logreg.coef_.T) + logreg.intercept_

In [90]:
#output is Y before going through the sigmoid function
output.head()

Unnamed: 0,0
0,-2.143233
1,2.060859
2,0.344861
3,2.060859
4,-2.143233


In [92]:
#then take the Y predictions and get the sigmoid (predicted probability)
signmoid(-2.143233)

0.10496526891511966

In [93]:
from sklearn.ensemble import RandomForestClassifier

In [94]:
rf = RandomForestClassifier()

In [95]:
rf.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [96]:
rf.feature_importances_

array([0.71777034, 0.02615389, 0.25607577])

In [97]:
rf.predict_proba(X_test)

array([[0.8707089 , 0.1292911 ],
       [0.51065891, 0.48934109],
       [0.83826195, 0.16173805],
       [0.8707089 , 0.1292911 ],
       [0.51065891, 0.48934109],
       [0.8707089 , 0.1292911 ],
       [0.51065891, 0.48934109],
       [0.83826195, 0.16173805],
       [0.51065891, 0.48934109],
       [0.8707089 , 0.1292911 ],
       [0.8707089 , 0.1292911 ],
       [0.61429765, 0.38570235],
       [0.03504741, 0.96495259],
       [0.83826195, 0.16173805],
       [0.03504741, 0.96495259],
       [0.07324683, 0.92675317],
       [0.83826195, 0.16173805],
       [0.8707089 , 0.1292911 ],
       [0.51065891, 0.48934109],
       [0.51065891, 0.48934109],
       [0.61429765, 0.38570235],
       [0.8707089 , 0.1292911 ],
       [0.03504741, 0.96495259],
       [0.61429765, 0.38570235],
       [0.03504741, 0.96495259],
       [0.8707089 , 0.1292911 ],
       [0.03504741, 0.96495259],
       [0.8707089 , 0.1292911 ],
       [0.61429765, 0.38570235],
       [0.8707089 , 0.1292911 ],
       [0.

In [98]:
#takes the value of all individual trees and where they fell