# <center>Linear Regression</center><br>
<img src = "Regression.png"></img><br>
#### <div align='right'>Made by: **Asad Mahmood</div>**

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

<a id="toc"></a>

<div class="list-group" id="list-tab" role="tablist">
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Table of Contents</center></h2>

1. [Introduction](#Intro)
2. [Objective](#Obj)
3. [Exploratory Data Analysis](#EDA)
    1. Understanding Data
    1. Data Preprocessing
    2. Data Exploration
    3. Visual Exploration
4. [Model Building](#Model)
    1. Train and Test Split
    2. Lazy Prediction
    3. Fine Tuning Best Model
5. [Evaluation](#Eval)
6. [Conclusion](#Con)
7. [Recommendations](#Rec)

<a name="Intro"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Introduction</center></h3>


[Return to TOC](#toc)

<a name="Obj"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Objective</center></h3>

[Return to TOC](#toc)

<a name="EDA"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Exploratory Data Analysis</center></h3>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Understanding Data</center></h4>

In [1]:
# Check which columns have what % of null values 
for col in df.columns:
    print(col, str(round(100* df[col].isnull().sum() / len(df), 2)) + '%')

NameError: name 'df' is not defined

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Data Preprocessing</center></h4>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Data Exploration</center></h4>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Visual Exploration</center></h4>

### Histogram

In [3]:
HEIGHT = 500
WIDTH = 900
NBINS = 50
SCATTER_SIZE=700

def plot_histogram(dataframe, column, color, bins, title, width=WIDTH, height=HEIGHT):
    figure = px.histogram(
        dataframe, 
        column, 
        color=color,
        nbins=bins, 
        title=title, 
        width=width,
        height=height
    )
    figure.update_layout({
            'plot_bgcolor': 'rgba(0, 0, 0, 0)',
            'paper_bgcolor': 'rgba(0, 0, 0, 0)',
        })
    figure.show()

### Violin Plot

In [7]:
fig = px.violin(
    df, 
    x="DEATH_EVENT", 
    y="age", 
    points='all',
    title='Age & DEATH_EVENT box plot',
    width=WIDTH,
    height=HEIGHT,
    box = True
)
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

'\n\n#fig = px.violin(\n    df, \n    x="DEATH_EVENT", \n    y="age", \n    points=\'all\',\n    title=\'Age & DEATH_EVENT box plot\',\n    width=WIDTH,\n    height=HEIGHT,\n    box = True\n)\nfig.update_layout({\n    \'plot_bgcolor\': \'rgba(0, 0, 0, 0)\',\n    \'paper_bgcolor\': \'rgba(0, 0, 0, 0)\',\n})\n\nfig.show()\n\n'

[Return to TOC](#toc)

<a name="Model"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Model Building</center></h3>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Train and Test Split</center></h4>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Lazy Prediction</center></h4>

<div class="list-group" id="list-tab" role="tablist">
<h4 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gray; border:0' role="tab" aria-controls="home"><center>Fine Tuning Model</center></h4>

### Hyperparameter tuning

#### Libraries

In [29]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split

#### Reading in Data

In [37]:
df = pd.read_csv('mobile_price_data.csv')

#### Grid Search

In [30]:
X = df.drop('price_range', axis = 1).values
y = df.price_range.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
# Using Random Forest as example 
clf = ensemble.RandomForestClassifier(n_jobs = -1) #This tells my pc to run all cores on the problem
param_grid = {
    #Put the parameters here
    #### Example 1: "n_estimators": [i for i in range(0,50)]#Change 0 and 50 here to your requirments
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [1, 3, 5, 7],
    "criterion": ["gini", "entropy"]
}

# Implementing GridSearch
model = model_selection.GridSearchCV(
    estimator = clf,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 10,
    n_jobs = 1,
    cv = 5,
)
model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=1, n_estimators=100, score=0.606, total=   3.8s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=100, score=0.609, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=100, score=0.578, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=100, score=0.566, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.2s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=100, score=0.625, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.4s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=200, score=0.613, total=   0.3s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    4.6s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=200, score=0.637, total=   0.3s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.9s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=200, score=0.569, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    5.1s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=200, score=0.544, total=   0.2s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.4s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=200, score=0.578, total=   0.3s
[CV] criterion=gini, max_depth=1, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=300, score=0.597, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=300, score=0.603, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=300, score=0.562, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=300, score=0.578, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=300, score=0.569, total=   0.5s
[CV] criterion=gini, max_depth=1, n_estimators=400 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=400, score=0.575, total=   0.5s
[CV] criterion

[CV]  criterion=gini, max_depth=7, n_estimators=200, score=0.878, total=   0.3s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=200, score=0.875, total=   0.3s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=200, score=0.875, total=   0.3s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=200, score=0.856, total=   0.3s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=200, score=0.831, total=   0.3s
[CV] criterion=gini, max_depth=7, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=300, score=0.875, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=300 ...................
[CV]  criterion=gini, max_depth=7, n_estimators=300, score=0.866, total=   0.4s
[CV] criterion

[CV]  criterion=entropy, max_depth=3, n_estimators=400, score=0.762, total=   0.5s
[CV] criterion=entropy, max_depth=5, n_estimators=100 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=100, score=0.859, total=   0.2s
[CV] criterion=entropy, max_depth=5, n_estimators=100 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=100, score=0.834, total=   0.2s
[CV] criterion=entropy, max_depth=5, n_estimators=100 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=100, score=0.847, total=   0.2s
[CV] criterion=entropy, max_depth=5, n_estimators=100 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=100, score=0.825, total=   0.2s
[CV] criterion=entropy, max_depth=5, n_estimators=100 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=100, score=0.819, total=   0.2s
[CV] criterion=entropy, max_depth=5, n_estimators=200 ................
[CV]  criterion=entropy, max_depth=5, n_estimators=200, score=0.856, total= 

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   59.8s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 3, 5, 7],
                         'n_estimators': [100, 200, 300, 400]},
             scoring='accuracy', verbose=10)

In [32]:
# Printing best accuracy score because labels are equally divided and the estimators
print(model.best_score_)
print(model.best_estimator_.get_params())

0.869375
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [38]:
#Fine tuned model
clf = ensemble.RandomForestClassifier(**model.best_params_)
clf.fit(X, y)
y_pred = clf.predict(X_test)

In [39]:
metrics.accuracy_score(y_test,y_pred)

0.975

[Return to TOC](#toc)

<a name="Eval"></a>

<a name="Model"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Evaluation</center></h3>

[Return to TOC](#toc)

<a name="Con"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Conclusion</center></h3>

[Return to TOC](#toc)

<a name="Rec"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0' role="tab" aria-controls="home"><center>Recommendation</center></h3>

[Return to TOC](#toc)