# Model Training


### Feature Engineering

#### Steps 
* Exploring Features of the dataset
* Handling Missing Values
* Feature Engineering
* Feature Selection 

#### Import Packages and CSV

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)
# Create Dataframe
df = pd.read_csv("C:/Users/dhanu/Downloads/New folder/New folder/cleaned_wppus.csv")

# Print shape of dataset
print(df.shape)

(266, 126)


##  Data cleaning and Data Transformation

### Feature Extraction

In EDA file , wind data has been seperated from other data.

### Other Data Manipulations

* **Based on the In-Sights from EDA, We Drop Columns which are not much impacting the target variable**

In [None]:
No columns dropped

In [23]:
df = df.replace(
    to_replace='--', 
    value=0)

df = df.replace(
    to_replace='NM', 
    value=0)


In [9]:
print(df)

       date  other_united_states  other_new_england  other_connecticut  \
0    Jan-01                 6025                678                 75   
1    Feb-01                 5310                599                 77   
2    Mar-01                 5795                660                 96   
3    Apr-01                 5827                593                 92   
4    May-01                 5738                640                 72   
5    Jun-01                 5964                633                 71   
6    Jul-01                 6244                632                 72   
7    Aug-01                 6293                684                 72   
8    Sep-01                 5770                635                 69   
9    Oct-01                 6050                551                 69   
10   Nov-01                 5713                604                 70   
11   Dec-01                 6038                650                 73   
12   Jan-02                 6599      

### Types of Features 

* **Numerical Features**

In [24]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 89


* **Categorical Features**

In [26]:
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Categorical Features : 37


* **Discrete Features**

In [27]:
discrete_feature=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_feature))

Num of Discrete Features : 9


In [28]:
year_features=[feature for feature in df.columns if 'date' in feature or 'month' in feature]
print('Num of Year Features :',len(year_features))


Num of Year Features : 1


* **Continues Features**

In [29]:
continuous_feature=[feature for feature in num_features if feature not in discrete_feature+year_features]
print('Num of Continuous Features :',len(continuous_feature))

Num of Continuous Features : 80


## Feature Selection

* Multicollinearity occurs when there are two or more independent variables in a multiple regression model, which have a high correlation among themselves. When some features are highly correlated.
* Multicollinearity can be detected using various techniques, one such technique being the Variance Inflation Factor(VIF).

### Outlier Removal and Checking Skewness

* For Skewed distributions: Use Inter-Quartile Range (IQR) proximity rule.

* The data points which fall below Q1 – 1.5 IQR or above Q3 + 1.5 IQR are outliers.

* where Q1 and Q3 are the 25th and 75th percentile of the dataset respectively, and IQR represents the inter-quartile range and given by Q3 – Q1.

In [None]:
for i in continuous_feature:
        plt.figure(figsize=(10,6))
        sns.set_style('ticks')
        ax = sns.boxplot(df[i])

In [None]:
def detect_outliers(col):
    # Finding the IQR
    percentile25 = df[col].quantile(0.25)
    percentile75 = df[col].quantile(0.75)
    print('\n ####', col , '####')
    print("percentile25",percentile25)
    print("percentile75",percentile75)
    iqr = percentile75 - percentile25
    upper_limit = percentile75 + 1.5 * iqr
    lower_limit = percentile25 - 1.5 * iqr
    print("Upper limit",upper_limit)
    print("Lower limit",lower_limit)
    df.loc[(df[col]>upper_limit), col]= upper_limit
    df.loc[(df[col]<lower_limit), col]= lower_limit    
    return df

In [None]:
for col in continuous_feature:
         detect_outliers(col)

In [None]:
for i in continuous_feature:
        plt.figure(figsize=(10,6))
        sns.set_style('ticks')
        ax = sns.boxplot(df[i])

In [None]:
#Date split to month and year

df['date'] =  pd.to_datetime(df['date'], format='%b-%y')
df.date.dt.year
df.date.dt.month


* **Split Dataframe to X and y**

* **Here we set a variable X i.e, independent columns, and a variable y i.e, dependent column as the “other_maine” column.**

In [30]:
from sklearn.model_selection import train_test_split
X = df.drop(['other_maine','date'],axis=1)
#X=df.drop(['date'],axis=1)
y = df['other_maine']
pd.pandas.set_option("display.max_columns", None)
print(df.shape)
X.shape

(266, 126)


(266, 124)

* **Check all columns which are in Train data set**

In [11]:
all_columns  = list(X.columns)

print(all_columns) 
len(all_columns)

['other_united_states', 'other_new_england', 'other_connecticut', 'other_massachusetts', 'other_new_hampshire', 'other_rhode_island', 'other_vermont', 'other_middle_atlantic', 'other_new_jersey', 'other_new_york', 'other_pennsylvania', 'other_east_north_central', 'other_illinois', 'other_indiana', 'other_michigan', 'other_ohio', 'other_wisconsin', 'other_west_north_central', 'other_iowa', 'other_kansas', 'other_minnesota', 'other_missouri', 'other_nebraska', 'other_north_dakota', 'other_south_dakota', 'other_south_atlantic', 'other_delaware', 'other_district_of_columbia', 'other_florida', 'other_georgia', 'other_maryland', 'other_north_carolina', 'other_south_carolina', 'other_virginia', 'other_west_virginia', 'other_east_south_central', 'other_alabama', 'other_kentucky', 'other_mississippi', 'other_tennessee', 'other_west_south_central', 'other_arkansas', 'other_louisiana', 'other_oklahoma', 'other_texas', 'other_mountain', 'other_arizona', 'other_colorado', 'other_idaho', 'other_mont

124

###  Feature Encoding

* **Extracting Categorical features from train set for feature encoding**

In [31]:
cat_features = [feature for feature in X.columns if X[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Categorical Features : 36


In [32]:
df=df.fillna(df.mean())


* **Checking for Unique variables in each column**

In [33]:
for feature in cat_features:
    print(feature,':', X[feature].nunique())

other_kansas : 216
other_delaware : 16
other_district_of_columbia : 7
other_alaska : 26
wind_new_england : 148
wind_connecticut : 4
wind_maine : 142
wind_massachusetts : 29
wind_new_hampshire : 54
wind_rhode_island : 25
wind_new_jersey : 5
wind_illinois : 208
wind_indiana : 158
wind_michigan : 158
wind_ohio : 113
wind_kansas : 215
wind_missouri : 138
wind_north_dakota : 206
wind_south_atlantic : 151
wind_delaware : 3
wind_florida : 2
wind_maryland : 57
wind_north_carolina : 40
wind_virginia : 6
wind_west_virginia : 125
wind_east_south_central : 10
wind_tennessee : 10
wind_oklahoma : 216
wind_arizona : 78
wind_idaho : 137
wind_montana : 148
wind_nevada : 31
wind_new_mexico : 187
wind_utah : 72
wind_washington : 215
wind_alaska : 20


## Pipeline for DataTransformation

 **One Hot Encoding for Columns which had lesser unique values and not ordinal**

* One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.


**Binary Encoder is used for Country which had 170 unique values**
* Binary encoding is a combination of Hash encoding and one-hot encoding. In this encoding scheme, the categorical feature is first converted into numerical using an ordinal encoder. Then the numbers are transformed in the binary number. After that binary value is split into different columns.

* Binary encoding works really well when there are a high number of categories, Like Countries in our case

In [None]:
# Create Column Transformer with 3 types of transformers
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
#from category_encoders.binary import BinaryEncoder
num_features = [feature for feature in X.columns if X[feature].dtype != 'O']
oh_columns = ['other_united_states','other_connecticut','other_texas','other_new_england','other_new_hampshire']
bin_columns = ['date']

numeric_transformer = StandardScaler()
#bin_transformer = BinaryEncoder()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
         #('binary' ,bin_transformer, bin_columns),
         ("oh", oh_transformer, oh_columns),
          ("num", numeric_transformer, num_features)
    ]
)

In [None]:
# Apply Column Transformer
X = preprocessor.fit_transform(X)


In [None]:
print(y.dtype)
print(y.shape)
print(X.shape)
print(X_train.shape)

In [None]:
#import pickle
#pickle.dump(preprocessor, open('preprocess.pkl', 'wb'))

## Train Test Split

In [34]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=36)
X_train.shape, X_test.shape

((199, 124), (67, 124))

## Model Selection

* **Here should understand the Various Classification models with default values from these models we can choose top 4 with Highest Accuracy score and proceed with HyperParameter Tuning**

In [35]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, \
                            precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier

In [19]:
#X_train=X_train.fillna(X_train.mean())
#np.isnan(X_train.values.any())
X_train[:] = np.nan_to_num(X_train)
X_test[:] = np.nan_to_num(X_test)

In [36]:
from sklearn import preprocessing
from sklearn import utils

#convert y values to categorical values from continuous
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

#view transformed values
print(y_transformed)



[ 67  19  48   6  55  45  44  83  58   0  24  63  80  32  65  25  54  58
  76  67  68  49  40  69  35   8  22   2  21  14  39  77  49   3  24  13
  12  30  45   2  20   4  16  44  30   6  26  78  63  33  72   7  48  47
  76  96  79  29  66  89  73  49  66  11  10  45  55  81  68  45  39  78
  69  51  62   9  49  89  90  98  92  53  59  94  99  57  70  43  39  31
  76  87  42  17  65  59  74  64  44  23   1  30  32  82  37  32  95  87
  74  75  93  32  57  72  88  81  34  28  42  97 110  89 106  27  39  96
 102 100  82  53 116 118 113 101 102  33  18  90 108 106 112 122 104 141
 131 111 129  38  48 121 122 123 118  98 141 125 130  94 125  60  33  62
  91  59  71  67 115 117 140 116 132  71   5  22  49  62  67  78 123 109
 128 122 101  53  36  67  61  77  42  43  84 153 151 145 154 103  78  79
  63  85  72  98 144 157 159 135 156  94  56  83  84  50  86 136 136 140
 149 148 146 120  51  40  35  19  52  69 134 152 102 105 137 127 107  15
   6  34  66  72 123 126 143  88 158  97  48  98  4

In [None]:
print(X_train.shape)
print(y_transformed.shape)
print(y_transformed.dtype)


In [1]:
print(df)


NameError: name 'df' is not defined

In [37]:
models = {
    #"Random Forest": RandomForestClassifier(),
    #"Naive Bayes": GaussianNB(),
    #"Gradient Boosting": GradientBoostingClassifier(),
    #"Logistic Regression": LogisticRegression(),
   # "K-Neighbors Classifier": KNeighborsClassifier(),
   # "XGBClassifier": XGBClassifier(), 
   #  "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    # "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}
#X_test.toarray()
#X_train.toarray()
#X.toarray()
#import numpy as np
#from scipy.sparse import csr_matrix
#temp = csr_matrix(X_train)
#X_train = temp.toarray()

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_transformed) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    
    print('='*35)
    print('\n')

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### Hyper Parameter Tuning

In [None]:
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}

rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

cat_params = {"learning_rate": [0.1, 0.01],
              "max_depth": [5, 8, 12, 20, 30]}

In [None]:
randomcv_models = [('KNN', KNeighborsClassifier(), knn_params),
                   ("RF", RandomForestClassifier(), rf_params),
                  # ('XGBoost', XGBClassifier(), xgboost_params),
                  # ('CatBoost', CatBoostClassifier(verbose=False), cat_params)
                   ]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    rf_random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    rf_random.fit(X_train, y_train)
    model_param[name] = rf_random. best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

### Models Re-Trained with Best Parameters

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve
models = {
    "Random Forest": RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features= 7, max_depth= None),
    "K-Neighbors Classifier": KNeighborsClassifier(n_neighbors=10),
    #"XGBClassifier": XGBClassifier(n_estimators= 200, max_depth= 30, learning_rate= 0.1, colsample_bytree= 0.4, n_jobs=-1), 
    #"CatBoosting Classifier": CatBoostClassifier(max_depth= 12, learning_rate= 0.1,verbose=False),
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc



    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    
    print('='*35)
    print('\n')

### Plot ROC-AUC curve

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve
plt.figure()

# Add the models to the list that you want to view on the ROC plot
auc_models = [
{
    'label': 'Random Forest Classifier',
    'model': RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features= 'auto', max_depth= None),
    'auc': 0.8618
},
{
    'label': 'XGBoost Classifier',
    'model': XGBClassifier(n_estimators= 200, max_depth= 20, learning_rate= 0.1, colsample_bytree= 0.8, n_jobs=-1),
    'auc': 0.8073
},
{
    'label': 'KNN Classifier',
    'model': KNeighborsClassifier(n_neighbors=10),
    'auc': 0.8629 
},
{
    'label': 'CatBoost Classifier',
    'model': CatBoostClassifier(max_depth= 12, learning_rate= 0.1,verbose=False),
    'auc': 0.8615
},
    
]

# Below for loop iterates through your models list
for m in auc_models:
    model = m['model'] # select the model
    model.fit(X_train, y_train) # train the model
# Compute False postive rate, and True positive rate
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
# Calculate Area under the curve to display on the plot
# Now, plot the computed values
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], m['auc']))
# Custom settings for the plot 
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positwive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig(r"./images/roc_auc/auc.png")
plt.show()   # Display

### Feature Selection for Model Deployment

In [None]:
best_xgb = XGBClassifier(n_estimators= 200, max_depth= 30, learning_rate= 0.1, colsample_bytree= 0.4, n_jobs=-1)
best_xgb = best_xgb.fit(X_train,y_train)
xgb_pred = best_xgb.predict(X_test)
score = accuracy_score(y_test,xgb_pred)
cr = classification_report(y_test,xgb_pred)

print("FINAL XGB")
print ("Accuracy Score value: {:.4f}".format(score))
print (cr)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(best_xgb, X_test, y_test)

In [None]:
import pickle
 
# Save the trained model as a pickle file.
pickle.dump(best_xgb, open('classificationmodel.pkl', 'wb'))