<a href="https://colab.research.google.com/github/ath0217/hello-github/blob/main/Lab_Session_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Use seaborn's "darkgrid" theme for the plots that follow.
sns.set_style("darkgrid")

In [None]:
!mkdir data

In [None]:
import gdown

# Google Drive direct-download links, listed in the same order as `outputs`.
urls = [
    # hitters  https://drive.google.com/file/d/1ULDuvWPWD3LmxpQ6FKUHe1prjdM8LmSY/view?usp=sharing
    'https://drive.google.com/uc?export=download&id=1ULDuvWPWD3LmxpQ6FKUHe1prjdM8LmSY',
    # churn_data  https://drive.google.com/file/d/1ofzUqu2idUmxe_DAOPghkR_SCcvLpPgN/view?usp=sharing
    'https://drive.google.com/uc?export=download&id=1ofzUqu2idUmxe_DAOPghkR_SCcvLpPgN',
]
outputs = ['hitters.csv', 'churn_data.csv']

# Fetch each link into data/ under its matching file name.
for link, filename in zip(urls, outputs):
    gdown.download(link, 'data/' + filename, quiet=False)

**Regularized linear models**


In [None]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, scale

In [None]:
# Load the hitters data (first CSV column becomes the index) and keep only
# rows without missing values.
hitters_path = 'data/hitters.csv'
df = pd.read_csv(hitters_path, index_col=0)
df = df.dropna()
df.index.name = 'Player'
df.info()

In [None]:
# Preview the first three rows of the cleaned hitters table.
df.head(3)

In [None]:
# One-hot encode the three categorical columns; every category gets its own
# indicator column named "<column>_<value>".
categorical_part = df[['League', 'Division', 'NewLeague']]
dummies = pd.get_dummies(categorical_part)
dummies.info()
print(dummies.head())

In [None]:
# Target (dependent variable): Salary.
y = df['Salary']

# Numeric predictors: everything except the target and the categorical columns
# that were one-hot encoded into `dummies`.
encoded_cols = ['League', 'Division', 'NewLeague']
X_ = df.drop(columns=['Salary'] + encoded_cols).astype('float64')

# Feature matrix: numeric predictors plus one indicator column per categorical
# variable.
indicator_cols = ['League_N', 'Division_W', 'NewLeague_N']
X = pd.concat([X_, dummies[indicator_cols]], axis=1)
X.info()

In [None]:
# Preview the first five rows of the assembled feature matrix.
X.head(5)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=714,
)

In [None]:
# Candidate penalty strengths: 100 log-spaced values running from 0.5 * 10**10
# down to 0.5 * 10**-2.
exponents = np.linspace(10, -2, 100)
alphas = 10**exponents * 0.5

# Show a few grid points (first, roughly each quarter, last) to illustrate the range.
for position in (0, 24, 49, 74, 99):
    print('{:.4f}'.format(alphas[position]))

In [None]:
# Ridge coefficient path: refit the model for every alpha in the grid and
# record the coefficients so their shrinkage can be plotted.
X_train_scaled = scale(X_train)  # standardize once; the result does not depend on alpha

ridge = Ridge()
coefs = []

for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(X_train_scaled, y_train)
    coefs.append(ridge.coef_)

fig, ax = plt.subplots()
ax.plot(alphas, coefs)  # one line per feature
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Ridge coefficients as a function of the regularization');

In [None]:
# Standardize with statistics learned on the training split only, so the
# penalty treats every feature on a comparable scale (as the coefficient-path
# plot does via scale()) and the test split is transformed with the same
# training mean and standard deviation.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Let RidgeCV choose alpha from the grid, then evaluate on the held-out split.
ridgecv = RidgeCV(alphas=alphas)
ridgecv.fit(X_train_std, y_train)
print('Best alpha: ', ridgecv.alpha_)
y_pred = ridgecv.predict(X_test_std)
print('Test MSE: ', mean_squared_error(y_test, y_pred))

In [None]:
# Lasso coefficient path: refit the model for every alpha in the grid and
# record the coefficients so their shrinkage can be plotted.
X_train_scaled = scale(X_train)  # standardize once; the result does not depend on alpha

lasso = Lasso(max_iter=10000)
coefs = []

for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(X_train_scaled, y_train)
    coefs.append(lasso.coef_)

fig, ax = plt.subplots()
ax.plot(alphas, coefs)  # one line per feature
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Lasso coefficients as a function of the regularization');

In [None]:
# Standardize with statistics learned on the training split only, so the
# penalty treats every feature on a comparable scale (as the coefficient-path
# plot does via scale()) and the test split is transformed with the same
# training mean and standard deviation.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Let LassoCV choose alpha from the grid, then evaluate on the held-out split.
lassocv = LassoCV(alphas=alphas, max_iter=10000)
lassocv.fit(X_train_std, y_train)
print('Best alpha: ', lassocv.alpha_)
y_pred = lassocv.predict(X_test_std)
print('Test MSE: ', mean_squared_error(y_test, y_pred))

In [None]:
# Boolean mask marking the negative Ridge coefficients. `ridge` still holds the
# fit from the final iteration of the coefficient-path loop (last value in `alphas`).
ridge.coef_ < 0

In [None]:
# Elastic Net coefficient path: refit the model for every alpha in the grid and
# record the coefficients so their shrinkage can be plotted.
X_train_scaled = scale(X_train)  # standardize once; the result does not depend on alpha

eln = ElasticNet(max_iter=10000)
coefs = []

for a in alphas:
    eln.set_params(alpha=a)
    eln.fit(X_train_scaled, y_train)
    coefs.append(eln.coef_)

fig, ax = plt.subplots()
ax.plot(alphas, coefs)  # one line per feature
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Elastic Net coefficients as a function of the regularization');

In [None]:
# Standardize with statistics learned on the training split only, so the
# penalty treats every feature on a comparable scale (as the coefficient-path
# plot does via scale()) and the test split is transformed with the same
# training mean and standard deviation.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Let ElasticNetCV choose alpha from the grid, then evaluate on the held-out split.
elasticcv = ElasticNetCV(alphas=alphas, max_iter=10000)
elasticcv.fit(X_train_std, y_train)
print('Best alpha: ', elasticcv.alpha_)
y_pred = elasticcv.predict(X_test_std)
print('Test MSE: ', mean_squared_error(y_test, y_pred))

**Decision Analytic Thinking**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import make_scorer, f1_score, confusion_matrix, accuracy_score,ConfusionMatrixDisplay

In [None]:
# Load the churn data; the first CSV column is used as the row index.
churn = pd.read_csv('data/churn_data.csv', index_col=0)
# Preview the first three rows.
churn.head(3)

In [None]:
# Column names, dtypes and non-null counts of the raw churn table.
churn.info()

In [None]:
# One-hot encode the three survey-style columns, then move the LEAVE column to
# the end so it is the last column of the table.
survey_cols = ['REPORTED_SATISFACTION', 'REPORTED_USAGE_LEVEL', 'CONSIDERING_CHANGE_OF_PLAN']
churn = pd.get_dummies(data=churn, columns=survey_cols)

non_target_cols = [col for col in churn.columns if col != 'LEAVE']
churn = churn[non_target_cols + ['LEAVE']]
churn.info()

In [None]:
# Replace the LEAVE labels with integer codes. pd.factorize numbers labels in
# order of first appearance, so which label becomes 0 and which becomes 1
# depends on the row order of the data; `uniques[k]` is the label for code k.
# NOTE(review): later cells score with f1_score, whose default positive label
# is 1 — confirm from the printed `uniques` that code 1 is the intended class.
codes, uniques = pd.factorize(churn['LEAVE'])
churn['LEAVE'] = codes
print(uniques)

In [None]:
# Preview the encoded table (indicator columns added, LEAVE as integer codes).
churn.head()

In [None]:
# Bar chart of how many rows fall in each LEAVE class.
sns.countplot(data=churn, x='LEAVE')

In [None]:
# Target is the encoded LEAVE column; every other column is a feature.
y = churn['LEAVE']
X = churn.drop(columns='LEAVE')

In [None]:

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=714,
    stratify=y,
)

In [None]:
# Candidate classifiers as (estimator, hyper-parameter grid) pairs; an empty
# grid means the estimator is fitted with its constructor settings only.
# The tree-based estimators are randomized, so they are seeded (same seed as
# the train/test split) to make the search results repeatable across runs.
models = [
    (DummyClassifier(), {}),
    (DecisionTreeClassifier(class_weight='balanced', random_state=714),
     {'max_depth': [None, 2, 4, 6], 'ccp_alpha': [0, 0.1, 1]}),
    (RandomForestClassifier(class_weight='balanced', random_state=714),
     {'max_depth': [None, 2, 4, 6], 'n_estimators': [25, 50, 100]}),
    (GradientBoostingClassifier(random_state=714),
     {'max_depth': [None, 2, 4, 6], 'n_estimators': [25, 50, 100]}),
    (LinearDiscriminantAnalysis(), {}),
    (QuadraticDiscriminantAnalysis(), {}),
    (LogisticRegressionCV(max_iter=500, solver='liblinear'),
     {'penalty': ['l1', 'l2']}),
]

In [None]:
%%time
f1 = make_scorer(f1_score)
best_models = []
for model, param in models:
  gds = GridSearchCV(model, scoring=f1, refit=True, param_grid=param, cv =3)
  gds.fit(X_train,y_train)
  best_models.append(gds.best_estimator_)

In [None]:
# Evaluate each tuned model on the held-out split: store its confusion matrix
# and print its accuracy next to the estimator's class name.
confusion_matrices = []
for fitted in best_models:
    y_pred = fitted.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)
    confusion_matrices.append(matrix)
    accuracy = accuracy_score(y_test, y_pred)
    print(fitted.__class__.__name__, 'score: ', accuracy)

In [None]:
# Switch to the "ticks" theme (no background grid) for the plots that follow.
sns.set_style("ticks")

In [None]:
# Plot the confusion matrix of the first model, labelling the axes with the
# original LEAVE labels.
first_matrix = confusion_matrices[0]
disp = ConfusionMatrixDisplay(confusion_matrix=first_matrix, display_labels=uniques)
disp.plot()

In [None]:
# Payoff for each (true class, predicted class) outcome, laid out like
# scikit-learn's confusion_matrix: rows are true labels, columns are predicted
# labels, so the two can be multiplied element-wise.
#   [0, 0] true 0, predicted 0 (true negative)  ->  0
#   [0, 1] true 0, predicted 1 (false positive) -> -1
#   [1, 0] true 1, predicted 0 (false negative) ->  0
#   [1, 1] true 1, predicted 1 (true positive)  -> 99
# NOTE(review): assumes class 1 is the class being acted on and that -1 is the
# cost of acting on a wrong positive prediction — confirm against the scenario.
cost_benefit_matrix = np.array([[0, -1],
                                [0, 99]])
print(cost_benefit_matrix)

In [None]:

# Raw counts of the first model's confusion matrix.
confusion_matrices[0]

In [None]:
# Draw one confusion-matrix figure per model. `models` and `confusion_matrices`
# are in the same order, so zipping them pairs each matrix with its estimator.
for conf_mat, (estimator, _) in zip(confusion_matrices, models):
    model_name = estimator.__class__.__name__
    print(model_name)
    disp = ConfusionMatrixDisplay(conf_mat, display_labels=uniques)
    disp.plot()
    disp.ax_.set_title(model_name)  # label the figure so it is readable on its own
    plt.show()

In [None]:

# Expected value per test instance for each model: multiply every
# confusion-matrix count by the matching cost_benefit_matrix entry, sum the
# products, and divide by the number of test instances so the counts act as
# outcome rates rather than raw totals.
for conf_mat, model in zip(confusion_matrices, models):
    total_value = np.multiply(conf_mat, cost_benefit_matrix).sum()
    expected_value = total_value / conf_mat.sum()
    print('Expected value of ', model[0].__class__.__name__, ": ", expected_value)