<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<h1>Machine Learning </h1>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

In [None]:
#Importing packages pandas, matplotlib.pyplot, seaborn, numpy, statsmodels
#sklearn.linear_model, train_test_split, LinearRegression, KNeighborsRegressor and StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import statsmodels.formula.api as smf 
import sklearn.linear_model
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import StandardScaler

# Read the first worksheet of the birthweight workbook into a DataFrame.
file = "./birthweight_low.xlsx"

# Cell contents that pandas should treat as missing values on load.
na_markers = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND',
              '-1.#QNAN', '-NaN', '-nan',
              '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
              'nan', 'null', ' ?']

birthweight = pd.read_excel(file,
                            sheet_name=0,
                            header=0,
                            na_values=na_markers)

# Preview the first five rows
print(birthweight.head())

# Report the dimensions of the freshly loaded frame
n_observations, n_features = birthweight.shape
print(f"""
Size of Original Dataset
------------------------
Observations: {n_observations}
Features:     {n_features}
""")

# Missing-value audit.
# Every result is printed explicitly: a bare expression in the middle of a
# cell (or anywhere in a script) is evaluated and then thrown away, so without
# print() these checks would produce no visible output.

# Is there at least one missing value anywhere in the dataframe?
print(birthweight.isnull().any().any())

# Number of missing values per feature
print(birthweight.isnull().sum(axis = 0))

# Share of missing values per feature (rounded to 2 decimals)
print(birthweight.isnull().mean().round(decimals = 2))

# Column dtypes and non-null counts; info() writes its own report to stdout
birthweight.info()

# Flag missing values: for every feature with at least one missing value,
# add a 0/1 indicator column named 'm_<feature>'.
# list(...) takes a snapshot of the column labels so that adding the indicator
# columns inside the loop cannot interfere with the iteration.
missing_flag_cols = []
for col in list(birthweight.columns):
    is_missing = birthweight[col].isnull()
    if is_missing.any():
        flag_col = 'm_' + col
        birthweight[flag_col] = is_missing.astype(int)
        missing_flag_cols.append(flag_col)

# Number of missing values per observation.
# Sum exactly the indicator columns created above instead of "the last three
# columns", which is only right when precisely three features have gaps and
# nothing else has been appended to the frame in the meantime.
birthweight['mv_sum'] = birthweight[missing_flag_cols].sum(axis = 1)

# Percentage of observations for each count of missing values
mv_sum_share = (birthweight['mv_sum'].value_counts(normalize = True,
                                                   sort      = True,
                                                   ascending = True)*100).sort_index().round(2)
print(f"""

Number of Missing Values per Observation 
----------------------------------------------
{mv_sum_share}
""")

# Impute missing values with the smaller of the column's mean and median.
# - min() picks the mean when the median is larger and the median when the mean
#   is larger; it also covers mean == median, so no column that has gaps is
#   skipped and left with NaNs.
# - The filled column is assigned back to the frame. Calling
#   fillna(inplace=True) on birthweight[col] acts on a selection of the frame
#   and is not guaranteed to write through to `birthweight`.
for col in birthweight.columns:
    if not birthweight[col].isnull().any():
        continue  # nothing to impute in this column

    col_mean = birthweight[col].mean()
    col_median = birthweight[col].median()
    fill_value = min(col_mean, col_median)

    birthweight[col] = birthweight[col].fillna(fill_value)

#Checking if there are any missing values remaining in the dataframe
print(f"""
Missing values remaining in the dataframe?
-----------------------------------------
{birthweight.isnull().any().any()}
""")

# Interaction ("crossed") features: each new column is the element-wise
# product of two existing columns and is named '<left>*<right>'.
crossed_pairs = [('cigs', 'drink'),
                 ('meduc', 'feduc'),
                 ('fage', 'mage'),
                 ('m_meduc', 'm_feduc'),
                 ('monpre', 'npvis')]

for left_col, right_col in crossed_pairs:
    birthweight[f'{left_col}*{right_col}'] = birthweight[left_col] * birthweight[right_col]

# Rationale behind the crossed features (printed for the reader)
crossing_rationale = """*The choice behind combining cigs and drink is to amplify the effect that such
behavior has on a baby's birthweight. This behavior is called crossfating.\n
*Crossing meduc with feduc, as well as m_meduc with m_feduc amplifies the magnitude 
of effects that educated parents has on a baby's birthweight.
*crossing the month the prenatal care began and the nupmber of prenatal visits can 
enhance the baby's health and birthweight.
"""
print(crossing_rationale)

# Show each of the new columns
print(f"""
cigs*drink
-------------
{birthweight['cigs*drink']}\n
meduc*feduc
-------------
{birthweight['meduc*feduc']}\n
fage*mage
-------------
{birthweight['fage*mage']}\n
m_meduc*m_feduc
-------------
{birthweight['m_meduc*m_feduc']}\n
monpre*npvis
-------------
{birthweight['monpre*npvis']}
""")
# Summary statistics for the numeric columns
print("Descriptive Statistics")
print('*' * 100)
print(birthweight.describe())

print("Correlation between bwght and continuous variables")

# Response variable plus the crossed features whose pairwise linear
# correlation is inspected
continuous_data = ['bwght',
                   'cigs*drink',
                   'meduc*feduc',
                   'fage*mage',
                   'm_meduc*m_feduc',
                   'monpre*npvis']

# Pearson correlation matrix, rounded to two decimals for annotation
birthweight_corr = birthweight[continuous_data].corr().round(2)

# Square 12x12-inch canvas for the heatmap
fig, ax = plt.subplots(figsize=(12, 12))

# Annotated grayscale heatmap with thin black cell borders
heatmap_style = dict(cmap='gray_r',
                     square=True,
                     annot=True,
                     linecolor='black',
                     linewidths=0.5)
sns.heatmap(data=birthweight_corr, ax=ax, **heatmap_style)

# Title, then render the figure
ax.set_title("""
Linear Correlation Heatmap for Birthweight Features
""")
plt.show()

# Distribution of the response variable (bwght) with a KDE overlay
bwght_ax = sns.histplot(data=birthweight, x='bwght', kde=True)

# Adding vertical reference lines: mean in red, median in blue
reference_lines = [(birthweight["bwght"].mean(), "red"),
                   (birthweight["bwght"].median(), "blue")]
for position, line_color in reference_lines:
    bwght_ax.axvline(x=position, color=line_color)

# Title, then render the plot
bwght_ax.set_title("""
Bwght Distribution
""")
plt.show()

# Natural log of the response variable
birthweight['log_bwght'] = np.log(birthweight['bwght'])

# Working copy without the omaps, fmaps, mblck and fwhte columns
df_bwght = birthweight.drop(columns=['omaps', 'fmaps', 'mblck', 'fwhte'])

# Subset of columns carried forward to the OLS dataframe
ols_source_cols = ['male', 'mwhte', 'moth', 'fblck', 'foth', 'bwght', 'm_npvis',
                   'mv_sum', 'cigs*drink', 'meduc*feduc', 'fage*mage',
                   'm_meduc*m_feduc', 'monpre*npvis', 'log_bwght']
df_birthweight = df_bwght[ols_source_cols].copy()

# Renaming the crossed columns: each crossed column is duplicated under a new
# name (appended at the end of the frame, in this order) ...
readable_names = {'crossfading':   'cigs*drink',
                  'age':           'fage*mage',
                  'education':     'meduc*feduc',
                  'prenatal_care': 'monpre*npvis',
                  'm_education':   'm_meduc*m_feduc'}
for new_name, crossed_name in readable_names.items():
    df_birthweight[new_name] = df_birthweight[crossed_name]

# ... and the originally named crossed columns are then dropped in `df`
df = df_birthweight.drop(columns=list(readable_names.values()))

print("New dataframe columns to be used for OLS regression")
print(df.columns)

# Both response columns are removed from every explanatory-variable frame
response_cols = ['bwght', 'log_bwght']

# Explanatory variables from the full dataset (Lasso and ARD models)
birthweight_data = birthweight.drop(columns=response_cols)

# Explanatory variables from the reduced dataframe (OLS model)
OLS_data = df.drop(columns=response_cols)

# Response variables
birthweight_target = df["bwght"]
log_birthweight_target = df["log_bwght"]  # only for OLS model

# Identical split settings for both feature sets: 25% held out, fixed seed
split_options = dict(test_size=0.25, random_state=219)

# Training and testing datasets for the OLS model
x_train, x_test, y_train, y_test = train_test_split(
    OLS_data, log_birthweight_target, **split_options)

# Training and testing datasets for the Lasso and ARD models
x_train_Full, x_test_Full, y_train_Full, y_test_Full = train_test_split(
    birthweight_data, birthweight_target, **split_options)



# Report the shape of every split
shape_report = f"""
OLS Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


OLS Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}

Lasso & ARD Training Data
-------------
X-side: {x_train_Full.shape}
y-side: {y_train_Full.shape}


Lasso & ARD Testing Data
------------
X-side: {x_test_Full.shape}
y-side: {y_test_Full.shape}
"""
print(shape_report)


# Re-attach each response column to its feature frame so that every split is
# available as a single DataFrame (statsmodels formulas need X and y together).

# OLS split: training frame, then testing frame
birthweight_train = pd.concat([x_train, y_train], axis="columns")
birthweight_test = pd.concat([x_test, y_test], axis="columns")

# Full-feature split: training frame, then testing frame
birthweight_train_Full = pd.concat([x_train_Full, y_train_Full], axis="columns")
birthweight_test_Full = pd.concat([x_test_Full, y_test_Full], axis="columns")

print("OLS Regression Model")

# OLS linear regression on the log of birthweight, fitted on the TRAINING
# split only.
ols_formula = "log_bwght ~ crossfading + age + education + prenatal_care"

lm_best = smf.ols(formula = ols_formula,
                  data    = birthweight_train)

# fitting the model based on the training data
results = lm_best.fit()

# displaying the summary output of the training fit
print(results.summary())

# Out-of-sample evaluation: predict the held-out rows with the model fitted on
# the training data and compute R-squared from those predictions. Refitting a
# second model on the test rows would only yield another in-sample R-squared
# and says nothing about how the trained model generalises.
ols_test_actual = birthweight_test['log_bwght']
ols_test_pred   = results.predict(birthweight_test)

ss_residual = ((ols_test_actual - ols_test_pred) ** 2).sum()
ss_total    = ((ols_test_actual - ols_test_actual.mean()) ** 2).sum()

# R-squared summary, computed from the fitted model rather than typed in by hand
lr_train_score = round(float(results.rsquared), 4)
lr_test_score  = round(float(1 - ss_residual / ss_total), 4)
lr_test_gap    = round(abs(lr_train_score - lr_test_score), 4)

#displaying the results
print(f'OLS Training Score :{lr_train_score}')
print(f'OLS Testing Score  :{lr_test_score}')

#displaying the gap between training and testing
print(f'OLS Train-Test Gap :{lr_test_gap}')

print("Lasso Regression Model")
# Lasso Regression model
#
# The `normalize` argument of Lasso was deprecated in scikit-learn 1.0 and
# removed in 1.2, so Lasso(normalize = True) raises a TypeError on current
# releases. The same fit is obtained by standardising the features explicitly
# and multiplying alpha by sqrt(n_samples):
#   normalize=True divided each centred column by its L2 norm
#   (= std * sqrt(n_samples)), whereas StandardScaler divides by std only.
lasso_scaler  = StandardScaler().fit(x_train_Full)   # statistics from training rows only
x_train_lasso = lasso_scaler.transform(x_train_Full)
x_test_lasso  = lasso_scaler.transform(x_test_Full)

lasso_alpha = 6.0 * np.sqrt(x_train_Full.shape[0])
lasso_model = sklearn.linear_model.Lasso(alpha = lasso_alpha)


# Fitting to the training data
lasso_fit = lasso_model.fit(x_train_lasso, y_train_Full)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_lasso)


# SCORING the results (R-squared), computed once and saved for future use.
# The built-in round() is used because score() may return a plain Python
# float, which has no .round() method.
lasso_train_score = round(lasso_model.score(x_train_lasso, y_train_Full), 4)
lasso_test_score  = round(lasso_model.score(x_test_lasso, y_test_Full), 4)

print('Lasso Training Score :', lasso_train_score)
print('Lasso Testing Score  :', lasso_test_score)


# displaying and saving the gap between training and testing
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)
print('Lasso Train-Test Gap :', lasso_test_gap)


print("ARD Regression Model")
# ARD Regression Model (default hyper-parameters)
ard_model = sklearn.linear_model.ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train_Full, y_train_Full)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test_Full)


# SCORING the results (R-squared), computed once and saved for future use.
# The built-in round() is used because score() may return a plain Python
# float, which has no .round() method.
ard_train_score = round(ard_model.score(x_train_Full, y_train_Full), 4)
ard_test_score  = round(ard_model.score(x_test_Full, y_test_Full), 4)

print('Training Score:', ard_train_score)
print('Testing Score :', ard_test_score)


# displaying and saving the gap between training and testing
ard_test_gap = round(abs(ard_train_score - ard_test_score), 4)
print('ARD Train-Test Gap :', ard_test_gap)

print("Knn Regression Model")
#KNN Regression Model
#
# KNN is distance based, so the features are standardised first. The scaler is
# fitted on the TRAINING rows only, so that no statistics from the test rows
# leak into the model, and the scaled splits are the ones actually fed to KNN.
scaler = StandardScaler()
scaler.fit(x_train_Full)

x_train_scaled = scaler.transform(x_train_Full)
x_test_scaled  = scaler.transform(x_test_Full)

# Scaled copy of the complete feature set, kept under the original names
x_scaled    = scaler.transform(birthweight_data)
x_scaled_df = pd.DataFrame(x_scaled)

# creating lists for training set accuracy and test set accuracy
training_accuracy = []
test_accuracy     = []


# candidate neighbor counts: 1 to 50, capped at the number of training rows
# because KNN cannot use more neighbors than it has training samples
max_neighbors      = min(50, x_train_scaled.shape[0])
neighbors_settings = range(1, max_neighbors + 1)


for n_neighbors in neighbors_settings:
    # Building the model
    knn_trial = KNeighborsRegressor(n_neighbors = n_neighbors)
    knn_trial.fit(x_train_scaled, y_train_Full)

    # Recording the training set R-squared
    training_accuracy.append(knn_trial.score(x_train_scaled, y_train_Full))

    # Recording the test set R-squared
    test_accuracy.append(knn_trial.score(x_test_scaled, y_test_Full))

# finding the optimal number of neighbors (+1 because the settings start at 1)
# NOTE(review): k is selected on the test split, so the test score reported
# below is optimistic; a validation split or cross-validation would avoid this.
opt_neighbors = test_accuracy.index(max(test_accuracy)) + 1
print(f"""The optimal number of neighbors is {opt_neighbors}""")

# INSTANTIATING a KNN model object with the neighbor count found above
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = opt_neighbors)


# FITTING to the training data
knn_fit = knn_reg.fit(x_train_scaled, y_train_Full)


# PREDICTING on new data
knn_reg_pred = knn_fit.predict(x_test_scaled)


# SCORING the results (R-squared), computed once and saved for future use.
# The built-in round() is used because score() may return a plain Python
# float, which has no .round() method.
knn_reg_score_train = round(knn_reg.score(x_train_scaled, y_train_Full), 4)
knn_reg_score_test  = round(knn_reg.score(x_test_scaled, y_test_Full), 4)

print('KNN Training Score:', knn_reg_score_train)
print('KNN Testing Score :', knn_reg_score_test)


# displaying and saving the gap between training and testing
knn_reg_test_gap = round(abs(knn_reg_score_train - knn_reg_score_test), 4)
print('KNN Train-Test Gap:', knn_reg_test_gap)
    



# Side-by-side summary of train score, test score and train-test gap for the
# four models; the OLS row is marked as the final model.
comparison_table = f"""
Model    Train Score        Test Score             Train_Test Gap    
-----    -----------        ----------             --------------
***OLS     {lr_train_score}               {lr_test_score}                 {lr_test_gap}  Final Model
Lasso      {lasso_train_score}              {lasso_test_score}                {lasso_test_gap}
ARD        {ard_train_score}             {ard_test_score}                {ard_test_gap}
KNN        {knn_reg_score_train}              {knn_reg_score_test}                {knn_reg_test_gap}
"""

print(comparison_table)




