# Sklearn
This is the most widely used library for machine learning.

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Load the cleaned telecom churn data; "churn" is the label to predict.
churn_df = pd.read_csv("Utility_files/datasets_course1_DC/telecom_churn_clean.csv")
# print(churn_df.head())
y = churn_df["churn"].values
features = ["account_length", "customer_service_calls"]
X = churn_df[features].values

# Fit a 6-nearest-neighbours classifier on the two selected features.
model = KNeighborsClassifier(n_neighbors=6)
model.fit(X, y)

# Observed range of each feature, used to draw synthetic samples below.
acc_len_max = churn_df["account_length"].max()
acc_len_min = churn_df["account_length"].min()
cus_se_ca_max = churn_df["customer_service_calls"].max()
cus_se_ca_min = churn_df["customer_service_calls"].min()

# Create 10 random integers per feature within the range of observed values.
# np.random.randint excludes its `high` bound, so 1 is added to make the
# observed maximum itself reachable.
n_new = 10
X_new_1 = np.random.randint(acc_len_min, acc_len_max + 1, size=(n_new, 1))
X_new_2 = np.random.randint(cus_se_ca_min, cus_se_ca_max + 1, size=(n_new, 1))
print(X_new_1)
print(X_new_2)
# Put the two columns side by side so each row is one (account_length,
# customer_service_calls) sample, matching the order of `features`.
X_new = np.concatenate([X_new_1, X_new_2], axis=1)
print(X_new)

pred = model.predict(X_new)
# Show the predictions as a column; -1 lets numpy infer the row count.
print(pred.reshape(-1, 1))


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features (every remaining column).
y_name = "churn"
y = churn_df[y_name]
X = churn_df.drop(columns=[y_name]).values

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test splits.
test_size = 0.3
random_state = 21
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

# Fit a 5-nearest-neighbours classifier and report its test-set accuracy.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

print("PRINT MODEL's ACCURACY")
print(knn.score(x_test, y_test))

In [None]:
# Record train and test accuracy for every k from 1 to 24, so the best
# number of neighbours can be picked by comparing the two curves.
k_train_acc, k_test_acc = {}, {}
for k in range(1, 25):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    k_train_acc[k] = knn.score(x_train, y_train)
    k_test_acc[k] = knn.score(x_test, y_test)


## LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
# Load the advertising data; the regression target is "sales".
sales_df = pd.read_csv("Utility_files/datasets_course1_DC/advertising_and_sales_clean.csv")

# Use every column except the target and "influencer" as features.
X = sales_df.drop(["sales", "influencer"], axis=1).values
y = sales_df["sales"].values

test_size = 0.3
# This was misspelled `random_sate`, which left the split below silently
# using whatever `random_state` an earlier cell had defined.
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

# Evaluating the model
from sklearn.metrics import mean_squared_error

# R^2 is the metric returned by LinearRegression.score.
r_squared = reg.score(X_test, y_test)
# Take the square root explicitly instead of passing `squared=False`, which
# is deprecated in recent scikit-learn releases. Ground truth is passed
# first and predictions second, as the metric's signature expects.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


## CROSS VALIDATION
A single train/test split may not give a reliable estimate of a model's performance, because the score depends on which rows happen to land in the test set. A more robust evaluation is cross-validation or, more precisely, $K$-fold cross-validation, which works as follows:
1. Divide the data into $K$ sets (folds).
2. In turn, use each fold as the test set and the remaining $K - 1$ folds combined as the training set.
3. Compute the metric of interest for each fold, then obtain a final performance estimate by summarizing the $K$ resulting values statistically (e.g. their mean and spread).


In [1]:
from sklearn.model_selection import cross_val_score, KFold

# 6-fold cross-validation; the rows are shuffled with a fixed seed before
# being cut into folds so the result is reproducible.
n_splits = 6
random_state = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

reg = LinearRegression()
# Pass the KFold splitter as `cv`. The original passed the undefined name
# `x`, which fails with a NameError before any fold is evaluated.
cv_scores = cross_val_score(reg, X, y, cv=kf)
# One score per fold (the estimator's default metric).
print(cv_scores)

NameError: name 'LinearRegression' is not defined

## Regularization
Regularization techniques can be used to prevent overfitting. Yet, with poorly chosen hyperparameters (for example a penalty strength that is too large) they can have the opposite effect and hurt performance. Here is a live demo in sklearn:

In [None]:
from sklearn.linear_model import Ridge  # linear regression regularized with the L2 penalty

# Regularization strengths to compare: 10^-4, 10^-3, ..., 10^4.
alpha_hyper = [10 ** exp for exp in range(-4, 5)]

# Fit one Ridge model per alpha on the training split and keep its score
# on the test split, in the same order as `alpha_hyper`.
ridge_scores = []
for alpha in alpha_hyper:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    ridge_scores += [ridge.score(X_test, y_test)]

print(ridge_scores)

In [None]:
from sklearn.linear_model import Lasso  # linear regression regularized with the L1 penalty (sum of absolute coefficient values)

# Fit a Lasso model with a fixed penalty strength and keep its learned
# coefficients for inspection.
lasso = Lasso(alpha=0.3).fit(X_train, y_train)
lasso_coeff = lasso.coef_


In [None]:
# Load the diabetes data and preview the first rows.
db_df = pd.read_csv("Utility_files/datasets_course1_DC/diabetes_clean.csv")
print(db_df.head())

# "diabetes" is the label; every other column is a feature.
y = db_df["diabetes"].values
X = db_df.drop(columns="diabetes").values

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit a 7-nearest-neighbours classifier and predict the held-out labels.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# WRONG argument order, printed only for comparison: the predictions are
# passed where the ground truth belongs.
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

# Correct order: the ground-truth labels come first, the predictions second.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



## LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier (liblinear solver) on the training
# split, then print its predicted labels for the test split.
logreg = LogisticRegression(solver="liblinear").fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(y_pred)


## HyperParameter Tuning.
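It can be done using two main approaches. The first is grid search: set the candidate values for each hyperparameter and try every combination of them. Yet this becomes too expensive with a large number of hyperparameters, because the number of combinations grows multiplicatively; in that case we use random search, which evaluates only a random sample of the combinations.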

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths for the search: 10^-4, 10^-3, ..., 10^3.
params_grid = dict(alpha=np.array([10 ** i for i in range(-4, 4)]))

# Shuffled 6-fold splitter with a fixed seed, used to score each candidate.
n_splits, random_state = 6, 5
kf = KFold(shuffle=True, n_splits=n_splits, random_state=random_state)

# Try every alpha in the grid on the training split.
lasso_cv = GridSearchCV(estimator=lasso, param_grid=params_grid, cv=kf)
lasso_cv.fit(X_train, y_train)

# Test-set score of the search, then the best cross-validated score and the
# parameters that achieved it.
print(lasso_cv.score(X_test, y_test))
print(lasso_cv.best_score_)
print(lasso_cv.best_params_)

# The same principle can be applied with RandomizedSearchCV, which samples
# parameter combinations instead of trying all of them.


## Dummy_Variables
In order for the sklearn package to function properly, all inputs must be numerical. The general approach to converting a categorical variable is to expand its possible values into binary variables and drop the original categorical variable. In other words: assume a categorical variable takes the values $[v_1, v_2, \dots, v_n]$. Then each $v_i$ is converted into a binary variable, where usually $1$ means that the input vector has that value and $0$ means that it does not.

In [None]:
# Load the music data, skipping the first column of the CSV (.iloc[:, 1:]).
music_df = pd.read_csv("Utility_files/datasets_course1_DC/music_clean.csv").iloc[:, 1:]
print(music_df.head())
# One binary column per genre value; drop_first=True omits the first level,
# which is implied when all the remaining dummy columns are 0.
music_dummies = pd.get_dummies(music_df["genre"], drop_first=True)
print(music_dummies.head())
# music_df = pd.concat([music_df, music_dummies], axis=1) # add the new "binary" variables to the original DataFrame
# music_df.drop("genre", axis=1, inplace=True) # drop the old categorical column

#### NOTE: if there is only one categorical column in a dataframe, then get_dummies will do all of the work for us: returning the correct
# number of additional columns after conversion.



## Dealing with missing DATA
It might be necessary to fill in the missing data with values derived from the data that is present. We can use sklearn to perform this data imputation.

In [None]:
from sklearn.impute import SimpleImputer

# Load the music data (the original comment said "diabetes", but this cell
# reads the music CSV).
music = pd.read_csv("Utility_files/datasets_course1_DC/music_clean.csv")
X_cat = music["genre"].values.reshape(-1, 1)  # a single column, reshaped to 2-D
X_num = music.drop(["genre", "popularity"], axis=1).values
y = music["popularity"].values

random_state = np.random.randint(50)
# Split the categorical features, numeric features and target in ONE call so
# all three stay aligned row for row. This replaces two separate calls that
# relied on sharing a seed and assigned y_train / y_test twice.
(
    X_cat_train,
    X_cat_test,
    X_num_train,
    X_num_test,
    y_train,
    y_test,
) = train_test_split(X_cat, X_num, y, random_state=random_state)

# Fill missing categorical values with the most frequent one. The imputer is
# fitted on the training split only and then reused on the test split.
imputer_cat = SimpleImputer(strategy="most_frequent")
X_cat_train = imputer_cat.fit_transform(X_cat_train)
X_cat_test = imputer_cat.transform(X_cat_test)


In [None]:
# Mean imputation (SimpleImputer's default strategy) for the numeric
# columns: fitted on the training split, then applied to the test split.
imputer_num = SimpleImputer(strategy="mean")
X_num_train = imputer_num.fit_transform(X_num_train)
X_num_test = imputer_num.transform(X_num_test)

# Reassemble the full feature matrices: numeric columns first, followed by
# the imputed categorical column.
X_train = np.concatenate((X_num_train, X_cat_train), axis=1)
X_test = np.concatenate((X_num_test, X_cat_test), axis=1)

In [None]:
# Count the missing values in each column and print the counts in ascending order.
print(music.isna().sum().sort_values())

In [None]:
# A Pipeline can chain the imputation step and the model into one estimator.
from sklearn.pipeline import Pipeline

# The steps run in the listed order: fill in missing values first, then
# fit the logistic-regression model.
num_imp = SimpleImputer()
log_reg = LogisticRegression()
steps = [
    ("imputer", num_imp),
    ("logistic_regression", log_reg),
]

pip = Pipeline(steps).fit(X_train, y_train)

# Evaluate on the held-out split (ground truth first, predictions second).
y_pred = pip.predict(X_test)
print(confusion_matrix(y_test, y_pred))

## Centring and scaling
Imputing values is not the sole preprocessing step. Features should also be on a comparable range of values. There are two main approaches:
1. normalization: subtract the minimum and divide by the range (maximum minus minimum): all values then lie between $0$ (the minimum) and $1$ (the maximum)
2. standardization: subtract the mean and divide by the standard deviation: the new data has mean $0$ and variance $1$

In [None]:
from sklearn.preprocessing import StandardScaler

# Predict "genre" from every other column.
X = music.drop("genre", axis=1).values
y = music["genre"].values
random_state = np.random.randint(50)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

# Learn the scaling statistics on the training features only, then apply
# the same transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The target is left unscaled. It holds class labels, and StandardScaler
# expects a 2-D feature matrix, so calling it on the 1-D target raised an
# error; refitting the same scaler would also have discarded the statistics
# learned from X_train. The *_scaled names are kept as plain aliases so any
# later cell that refers to them still runs.
y_train_scaled = y_train
y_test_scaled = y_test
