# Part 2: Modeling & Evaluation

## Imports

In [None]:
# Standard Imports
import warnings
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

# Modeling & Preprocessing Imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing   import MinMaxScaler
from sklearn.decomposition   import PCA
from sklearn.linear_model    import SGDClassifier
from sklearn.neighbors       import KNeighborsClassifier
from xgboost                 import XGBClassifier

# Custom Modules
import graphs
import model_evaluation

# Appearance
from IPython.core.display import display, HTML

# Notebook Preferences
display(HTML("<style>.container { width:95% !important; }</style>"))
warnings.filterwarnings("ignore")
%matplotlib inline

## Table Of Contents

- [Data Overview](#Data-Overview)
    - [Reading In The Data](#Reading-In-The-Data)
    - [Overview](#Overview)
    
- [Preprocessing](#Preprocessing)
    - [Train-Test Split](#Train-Test-Split)
    - [Principal Component Analysis](#Principal-Component-Analysis)

## Data Overview

### Reading In The Data

In [None]:
# Location of the CSV file to load into a dataframe
clean_data_path = "../Data/clean_phys_data.csv"
phys_data = pd.read_csv(clean_data_path)

### Overview

In [None]:
# First five rows of the dataframe

phys_data.head(n = 5)

In [None]:
# Shape of the data: `shape` is a (rows, columns) tuple

n_rows, n_cols = phys_data.shape
print(f"The dataframe has {n_rows} rows and {n_cols} columns.")

In [None]:
# Data type of each column in the dataframe

phys_data.dtypes

In [None]:
# Total number of missing values: count them per column first,
# then add the per-column counts together

missing_per_column = phys_data.isna().sum()
missing_per_column.sum()

## Preprocessing

### Train-Test Split

In [None]:
# The target is the "signal" column; the features are every remaining
# column except "id"

y = phys_data["signal"]
X = phys_data.drop(columns = ["id", "signal"])

In [None]:
# Splitting the features and target into training and testing sets.
# The split is stratified on y and uses a fixed seed (42).

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state = 42
)

In [None]:
# MinMax scaling the data for the PCA
#
# MinMaxScaler takes its configuration in the constructor and the data in
# fit/transform. Passing X_train to the constructor would only set it as the
# `feature_range` argument and scale nothing. The fitted scaler is kept so
# that other data (e.g. X_test) can be transformed with the same min/max.

mm_scaler = MinMaxScaler()
X_mm      = mm_scaler.fit_transform(X_train)

### Principal Component Analysis

In [None]:
# Instantiating the PCA without arguments, i.e. with scikit-learn's defaults
pca = PCA()

In [None]:
# Fit the PCA on the min-max scaled *training* features only.
# Fitting on the full, unscaled X would let the test rows influence the
# components and let the features with the largest raw ranges dominate them.
# PCA is unsupervised, so no target is passed to fit.
X_mm = MinMaxScaler().fit_transform(X_train)

# `fit` returns the fitted estimator, so `b` refers to the fitted PCA
b = pca.fit(X_mm)

In [None]:
# Number of components the fitted PCA kept. `n_components` (no trailing
# underscore) is only the constructor argument, which was never set for
# PCA(); the fitted value lives in `n_components_`.
b.n_components_

In [None]:
# Cumulative share of the variance explained by the first k components
cumulative_variance = np.cumsum(b.explained_variance_ratio_)

# The first entry belongs to one component, so the x-values start at 1;
# plotting against the default zero-based index would shift the curve one
# step to the left of the "Number Of Components" axis.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize = (18,6), facecolor = "white")
plt.plot(component_counts, cumulative_variance, color = "navy")
# Reference line at five components
# NOTE(review): with the 1-based x-axis this marks 5 components; confirm
# that is the intended cut-off.
plt.axvline(5, color = "darkorange")
plt.title("Explained Variance", size = 18)
plt.xlabel("Number Of Components", size = 16)
plt.xticks(np.arange(0,55,5), size = 14)
plt.ylabel("Variance %", size = 16)
# The trailing semicolon suppresses the text output of the last call
plt.yticks(np.arange(0,1.1,0.1), size = 14);