In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import sklearn
import warnings 
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the training data and keep a fixed random sample of 1200 rows
# (random_state pins which rows are drawn).
df = pd.read_csv("data/train.csv")
df = df.sample(1200, random_state=786)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Per-column flag: True where the column holds at least one missing value.
df.isnull().any()

In [None]:
# Replace the abbreviated column names with readable ones.
df = df.rename(columns={'blue':'bluetooth', 'fc':'front_cam_mp', 'sc_h':'screen_ht', 'sc_w':'screen_wt'})

In [None]:
df = df.rename(columns={'pc':'back_cam_mp'})

In [None]:
# These four columns are not referenced again anywhere below.
df = df.drop(columns=['m_dep', 'mobile_wt', 'px_height', 'px_width'])

In [None]:
# Feature groups used by the summary cells below. TARGET is a list, so df[TARGET]
# yields a one-column DataFrame rather than a Series.
# NOTE(review): 'clock_speed' is only binned into levels in a later cell, and
# screen_ht, screen_wt and talk_time appear in neither list - confirm this is intended.
categorical_features = ['bluetooth', 'clock_speed', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
continuous_features = ['battery_power', 'front_cam_mp', 'int_memory', 'n_cores', 'back_cam_mp', 'ram']
TARGET = ['price_range']

In [None]:
# Summary statistics for the columns listed in continuous_features.
df[continuous_features].describe()

In [None]:
# NOTE(review): repeats the df.shape cell above; nothing has changed the row count since.
df.shape

In [None]:
# Counts of each distinct *combination* of the categorical columns (not per-column counts).
df[categorical_features].value_counts()

In [None]:
# Column-by-column inspection: describe() for numeric ranges, unique()/value_counts()
# for the distinct values of each remaining column.
df['battery_power'].describe()

In [None]:
df['bluetooth'].unique()

In [None]:
df['clock_speed'].unique()

In [None]:
df['dual_sim'].value_counts()

In [None]:
df['front_cam_mp'].unique()

In [None]:
df['four_g'].value_counts()

In [None]:
df['int_memory'].describe()

In [None]:
df['n_cores'].value_counts()

In [None]:
df['back_cam_mp'].describe()

In [None]:
df['ram'].describe()

In [None]:
df['screen_ht'].unique()

In [None]:
df['screen_wt'].unique()

In [None]:
df['talk_time'].describe()

In [None]:
df['three_g'].value_counts()

In [None]:
df['touch_screen'].value_counts()

In [None]:
# NOTE(review): duplicate of the earlier df['four_g'].value_counts() cell.
df['four_g'].value_counts()

In [None]:
df['wifi'].value_counts()

In [None]:
# Class balance of the prediction target.
df['price_range'].value_counts()

In [None]:
# Bin clock speed into three ordered levels. pd.cut uses right-closed intervals by
# default, so the bins are (0, 1] -> 'low', (1, 2] -> 'mid', (2, 3] -> 'high';
# any value outside (0, 3] becomes NaN.
df['clock_speed'] = pd.cut(df['clock_speed'], bins=[0, 1, 2, 3], labels = ['low', 'mid', 'high'])


In [None]:
df.head()

In [None]:
# Encode the ordered clock-speed levels as integers for the scalers and models below.
# Series.replace on a categorical column is deprecated in recent pandas and leaves a
# categorical dtype behind; map(...).astype(int) always yields a plain integer column.
# astype(int) fails loudly if any clock speed fell outside the bins and became NaN.
level_map = {'low': 0, 'mid': 1, 'high': 2}
df['clock_speed'] = df['clock_speed'].map(level_map).astype(int)

In [None]:
df.head()

In [None]:
df[['screen_ht', 'screen_wt']].describe()

In [None]:
# Cast both screen dimensions to float in one call so the imputed means can be stored.
df = df.astype({'screen_ht': float, 'screen_wt': float})

In [None]:
# Distinct screen heights among the rows whose recorded width is 0; used by the
# imputation loop further down.
x = df[df['screen_wt']==0]['screen_ht'].value_counts().index.tolist()

In [None]:
# Heights of every row whose width is recorded as 0 (duplicates kept in arr).
arr = df.loc[df['screen_wt'] == 0, 'screen_ht'].tolist()

set(arr)

In [None]:
# Mean *non-zero* screen width for each height that has at least one zero-width row.
# The original loop used `if width == 0: pass`, which does nothing, so the zero
# placeholders were averaged in and dragged every mean down.
nonzero_widths = df.loc[df['screen_wt'] != 0, 'screen_wt']
# Fallback for heights where every recorded width is 0: the overall non-zero mean.
fallback_width = round(float(nonzero_widths.mean()), 2)

mean_width = {}
for d in set(arr):
    widths = df.loc[(df['screen_ht'] == d) & (df['screen_wt'] != 0), 'screen_wt']
    mean = round(float(widths.mean()), 2) if len(widths) else fallback_width
    print("Mean width for height", d, "=", mean)
    mean_width[d] = mean

In [None]:
# Replace each zero width with the mean width computed for that row's height.
for z in x:
    needs_fill = (df['screen_wt'] == 0.0) & (df['screen_ht'] == z)
    df['screen_wt'] = df['screen_wt'].mask(needs_fill, mean_width.get(z))

In [None]:
df.head()

In [None]:
# Collapse height and width into one diagonal feature, then drop the two inputs.
# Dividing by 2.54 presumably converts centimetres to inches - TODO confirm the units.
diagonal = np.sqrt(df['screen_ht'] ** 2 + df['screen_wt'] ** 2)
df['screen_size'] = (diagonal / 2.54).round(2)
df = df.drop(columns=['screen_ht', 'screen_wt'])

In [None]:
# Reorder columns so the target is the last one; later cells index df.columns with
# positions taken from the feature matrix and rely on this ordering.
feature_columns = [name for name in df.columns if name not in TARGET]
df = df[feature_columns + TARGET]

# Data Visualisation
## 1 variable 

In [None]:
# Bar plot: how many phones have each core count.
core_counts = df['n_cores'].value_counts()

fig = plt.figure(figsize=(6, 4))
title = fig.suptitle("No. of Cores vs.Frequency", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel("No.of Cores")
ax.set_ylabel("Frequency")
ax.tick_params(axis='both', which='major', labelsize=8.5)
bar = ax.bar(
    list(core_counts.index),
    list(core_counts.values),
    color='steelblue',
    edgecolor='black',
    linewidth=1,
)

In [None]:
# Pie chart of the clock-speed levels.
# value_counts() orders by frequency, so applying ['low', 'mid', 'high'] as legend
# labels positionally could label wedges with the wrong level. Sort by level code and
# attach the level name to each count so wedge and legend always agree.
labels = ['low', 'mid', 'high']
level_names = dict(enumerate(labels))
level_counts = df['clock_speed'].value_counts().sort_index()
level_counts.index = [level_names.get(code, code) for code in level_counts.index]
level_counts.plot(kind='pie', autopct='%.2f')
plt.tight_layout()
plt.legend(level_counts.index)
plt.show()

In [None]:
# Kernel density estimate of the derived screen_size feature.
df['screen_size'].plot(kind='density')

## 2 variable 

In [None]:
# 2-D histogram of price range (4 bins, one per class) against RAM (16 bins);
# colour encodes the number of phones in each cell.
plt.hist2d(df['price_range'], df['ram'], bins=(4, 16), cmap='Blues')
cb = plt.colorbar()
cb.set_label('counts in bin')

In [None]:
# Global seaborn theme; affects every plot drawn after this cell.
sns.set(style="ticks", color_codes=True)

In [None]:

# Side-by-side bar plots of talk-time frequency, split by dual-SIM support.
fig = plt.figure(figsize=(10, 4))
title = fig.suptitle("Dual_sim vs. talk_time", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

# (dual_sim value, subplot position, panel title, x-axis label, bar colour)
panels = [
    (0, 1, "Non Dual Sim", "Talk-Time", 'red'),
    (1, 2, "Dual Sim", "Talk-time", 'white'),
]
for sim_flag, position, panel_title, x_label, bar_color in panels:
    panel = fig.add_subplot(1, 2, position)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel("Frequency")
    talk_counts = df[df['dual_sim'] == sim_flag]['talk_time'].value_counts()
    # Same y range on both panels so bar heights are directly comparable.
    panel.set_ylim([0, 70])
    panel.tick_params(axis='both', which='major', labelsize=8.5)
    panel.bar(
        list(talk_counts.index),
        list(talk_counts.values),
        color=bar_color,
        edgecolor='black',
        linewidth=1,
    )

In [None]:
# Distribution of battery power within each price range.
df.boxplot(column='battery_power', by='price_range')
plt.show()

## 3 variable 

In [None]:

# Pairwise scatter plots of the main numeric features, coloured by price range.
# (No scaling happens here; the plot uses the raw column values.)
cols = ['ram', 'int_memory', 'screen_size', 'battery_power','price_range']
# seaborn renamed pairplot's `size` keyword to `height`; current releases reject `size`.
pp = sns.pairplot(df[cols], hue='price_range', height=1.8, aspect=1.8,
                  palette={0: "#FF9999", 1: "#FFE888", 2:"#2A9D8F", 3:"#E63946"},
                  plot_kws=dict(edgecolor="black", linewidth=0.5))
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
# The title previously said "Wine", left over from another dataset.
t = fig.suptitle('Mobile Attributes Pairwise Plots', fontsize=14)

In [None]:
# Line plot of internal memory against front-camera megapixels, one line per four_g value.
sns.relplot(y="int_memory", x="front_cam_mp", hue='four_g', kind="line", data=df)

In [None]:
# Axes3D is never referenced by name; presumably imported for its side effect of
# registering the '3d' projection on older matplotlib - TODO confirm it is still needed.
from mpl_toolkits.mplot3d import Axes3D
# NOTE(review): pyplot is already imported in the first cell.
import matplotlib.pyplot as plt



fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')

# One red marker per phone: battery power (x), back camera (y), screen size (z).
ax.scatter(df['battery_power'], df['back_cam_mp'], df['screen_size'], c='r', marker='o')

ax.set_xlabel('Battery Power')
ax.set_ylabel('Back_Camera_Px')
ax.set_zlabel('Screen_Size')

plt.show()

# Cross Validation performance with full set of features

In [None]:
# Baseline: 1-nearest-neighbour accuracy using every feature.
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
# KNeighborsClassifier was previously only imported much further down, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
from sklearn.neighbors import KNeighborsClassifier

Data = df.drop(columns=['price_range'])
target = df[TARGET]
# NOTE(review): the scaler is fitted on all rows before cross-validation, so each
# validation fold influences the scaling applied to its training folds.
Data = preprocessing.MinMaxScaler().fit_transform(Data)

# clf, cv_method and scoring_metric are reused by the feature-selection cells below.
clf = KNeighborsClassifier(n_neighbors=1)
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=786)
scoring_metric = 'accuracy'
cv_results_full = cross_val_score(estimator=clf, X=Data, y=target, cv=cv_method, scoring=scoring_metric)

cv_results_full.mean().round(2)

# Feature selection using fscore

In [None]:
# Rebuild the scaled feature matrix and target (same steps as the cross-validation cell above).
Data = df.drop(columns=['price_range'])
target = df[TARGET]
Data = preprocessing.MinMaxScaler().fit_transform(Data)

In [None]:
from sklearn import feature_selection as fs

# Rank features by ANOVA F-score and keep the positions of the top num_features.
num_features = 8
fs_fit_fscore = fs.SelectKBest(fs.f_classif, k=num_features)
# Only the fitted scores are needed, so call fit() rather than fit_transform() and
# discarding the result; ravel() passes y as the 1-D array scikit-learn expects.
fs_fit_fscore.fit(Data, target.values.ravel())
# nan_to_num keeps features with an undefined score from sorting to the top.
fs_indices_fscore = np.argsort(np.nan_to_num(fs_fit_fscore.scores_))[::-1][0:num_features]
fs_indices_fscore

In [None]:
# Translate column positions in Data back to names. This works only because
# price_range is the last column of df, so positions in Data (df without the target)
# line up with positions in df.columns.
best_features_fscore = df.columns[fs_indices_fscore].values
best_features_fscore

In [None]:
# F-scores of the selected features, in the same (descending) order.
feature_importances_fscore = fs_fit_fscore.scores_[fs_indices_fscore]
feature_importances_fscore

In [None]:
import altair as alt

def plot_imp(best_features, scores, method_name, color):
    """Build an Altair bar chart of feature importances.

    Parameters
    ----------
    best_features : sequence of str
        Feature names, in the order they should appear on the x axis.
    scores : sequence of float
        Importance value for each entry of ``best_features``.
    method_name : str
        Name of the selection method; prefixed to the chart title.
    color : str
        Fill colour of the bars.

    Returns
    -------
    alt.Chart
        The chart object; it renders when it is the last expression of a cell.
    """
    # Named importance_table rather than df so the notebook's main frame is not shadowed.
    importance_table = pd.DataFrame({'features': best_features,
                                     'importances': scores})

    return (
        alt.Chart(importance_table,
                  width=500,
                  title=method_name + ' Feature Importances')
        .mark_bar(opacity=0.75, color=color)
        .encode(
            # sort=None keeps the incoming order. Per-axis options belong in alt.Axis;
            # alt.AxisConfig is the chart-wide configuration object.
            alt.X('features', title='Feature', sort=None, axis=alt.Axis(labelAngle=45)),
            alt.Y('importances', title='Importance'),
        )
    )

In [None]:
plot_imp(best_features_fscore, feature_importances_fscore, 'F-Score', 'red')


In [None]:
# Cross-validate the same 1-NN classifier (clf, cv_method and scoring_metric come from
# the full-feature cell) on the F-score-selected columns only.
cv_results_fscore = cross_val_score(estimator=clf,
                             X=Data[:, fs_indices_fscore],
                             y=target, 
                             cv=cv_method, 
                             scoring=scoring_metric)
cv_results_fscore.mean().round(3)

# Feature selection using random forest

In [None]:
# Rebuild the scaled feature matrix and target once more for the random-forest ranking.
Data= df.drop(columns=['price_range'])
target=df[TARGET]
Data=preprocessing.MinMaxScaler().fit_transform(Data)

In [None]:
# Preview the first rows only; echoing the whole scaled matrix floods the output.
Data[:5]

In [None]:
# RandomForestClassifier was previously only imported in a later cell, so this cell
# raised NameError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Rank features by random-forest importance. random_state makes the ranking
# reproducible; without it the selected features can change from run to run.
model_rfi = RandomForestClassifier(n_estimators=100, random_state=786)
model_rfi.fit(Data, target.values.ravel())
fs_indices_rfi = np.argsort(model_rfi.feature_importances_)[::-1][0:num_features]

In [None]:
# Column positions -> names; relies on price_range being the last column of df.
best_features_rfi = df.columns[fs_indices_rfi].values
best_features_rfi

In [None]:
feature_importances_rfi = model_rfi.feature_importances_[fs_indices_rfi]
feature_importances_rfi

In [None]:
plot_imp(best_features_rfi, feature_importances_rfi, 'Random Forest', 'green')

In [None]:
# Same 1-NN cross-validation as before, restricted to the forest-selected columns.
cv_results_rfi = cross_val_score(estimator=clf,
                             X=Data[:, fs_indices_rfi],
                             y=target, 
                             cv=cv_method, 
                             scoring=scoring_metric)
cv_results_rfi.mean().round(3)

In [None]:
# Mean cross-validated accuracy of the three feature sets, side by side.
print('Full Set of Features:', cv_results_full.mean().round(3))
print('F-Score:', cv_results_fscore.mean().round(3))
print('RFI:', cv_results_rfi.mean().round(3))

# Model Fitting
## Knn Classification
 

In [None]:
# From here on every model uses only the F-score-selected features.
Data = df[best_features_fscore].copy()
target = df[TARGET]
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so test rows influence the scaling applied to the training data.
Data = preprocessing.MinMaxScaler().fit_transform(Data)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split; random_state fixes the partition.
D_train, D_test, t_train, t_test = train_test_split(Data, target, test_size=0.3, random_state=786)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline KNN (k=5, p=2 i.e. Euclidean distance) scored on the held-out split.
knn_classifier = KNeighborsClassifier(n_neighbors=5, p=2)
knn_classifier.fit(D_train, t_train) 
knn_classifier.score(D_test, t_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def grid_search(D_train, t_train, clf, cv=None):
    """Tune ``clf`` with an exhaustive grid search and return the refitted winner.

    The hyper-parameter grid is chosen from the estimator's type; KNN, decision tree,
    random forest and SVC are supported.

    Parameters
    ----------
    D_train : array-like
        Training feature matrix.
    t_train : array-like
        Training labels.
    clf : estimator
        Unfitted estimator instance to tune.
    cv : cross-validation splitter, optional
        Splitter handed to ``GridSearchCV``. Defaults to the notebook-level
        ``cv_method`` so existing three-argument calls behave as before.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, fitted_search)``.

    Raises
    ------
    ValueError
        If ``clf`` is not one of the supported estimator types.
    """
    # (estimator type, grid) pairs, checked in order with isinstance so subclasses
    # are matched the same way the original if/elif chain matched them.
    param_grids = (
        (KNeighborsClassifier, {
            'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'p': [1, 2, 3],
        }),
        (DecisionTreeClassifier, {
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 3, 4],
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
        }),
        (RandomForestClassifier, {
            'n_estimators': [110, 130, 150, 200],
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 3, 4],
            'max_depth': [3, 4, 5],
        }),
        (SVC, {
            'C': [1, 10, 50, 100],
            'gamma': [1, 0.1, 0.05, 0.001],
            'kernel': ['rbf', 'poly', 'sigmoid'],
        }),
    )
    for estimator_type, candidate_grid in param_grids:
        if isinstance(clf, estimator_type):
            grid_params = candidate_grid
            break
    else:
        raise ValueError("unkown classifier")

    gs = GridSearchCV(
        estimator=clf,
        param_grid=grid_params,
        verbose=3,
        # Fall back to the notebook-global splitter only when the caller gave none.
        cv=cv_method if cv is None else cv,
        n_jobs=-1,
        refit=True,  # refit the best configuration on all of D_train
    )

    gs_results = gs.fit(D_train, t_train)
    return gs_results.best_estimator_, gs_results.best_params_, gs_results

In [None]:
# Tune KNN; grid_search returns (best estimator, best params, fitted search object).
knn_model, knn_best_estimate, knn_result = grid_search(D_train, t_train, knn_classifier)

In [None]:
knn_best_estimate

In [None]:
# Held-out accuracy of the tuned model.
knn_model.score(D_test, t_test)

In [None]:
# One row per parameter combination, with its mean cross-validated score.
results_KNN = pd.DataFrame(knn_result.cv_results_['params'])
results_KNN['test_score'] = knn_result.cv_results_['mean_test_score']
results_KNN.head()


In [None]:
# Human-readable names for the Minkowski power p (p=1 Manhattan, p=2 Euclidean).
results_KNN['metric'] = results_KNN['p'].replace([1, 2, 3], ["Manhattan", "Euclidean", "Minkowski"])
results_KNN.head()

In [None]:
# NOTE(review): altair is already imported in the feature-importance cell above.
import altair as alt

# Mean CV score against k, one line per distance metric.
alt.Chart(results_KNN, 
          title='KNN Performance Comparison'
         ).mark_line(point=True).encode(
    alt.X('n_neighbors', title='Number of Neighbors'),
    alt.Y('test_score', title='Mean CV Score', scale=alt.Scale(zero=False)),
    color='metric'
)

# Decision Tree Classification

In [None]:
# Baseline decision tree with default hyper-parameters, scored on the held-out split.
dt_classifier = DecisionTreeClassifier(random_state=786)
dt_classifier.fit(D_train, t_train)
dt_classifier.score(D_test, t_test)

In [None]:
# Tune the tree; grid_search returns (best estimator, best params, fitted search object).
dt_model, dt_best_estimate, dt_result = grid_search(D_train, t_train, dt_classifier)

In [None]:
dt_best_estimate

In [None]:
# Held-out accuracy of the tuned tree.
dt_model.score(D_test, t_test)

In [None]:
# Draw the tuned tree on a figure large enough to read the node text.
fig, ax = plt.subplots(figsize=(20, 10))
# feature_names is converted to a list because some scikit-learn versions reject a
# numpy array here. The trailing semicolon hides the list of annotation objects that
# plot_tree returns.
sklearn.tree.plot_tree(dt_model,
                       feature_names=list(best_features_fscore),
                       class_names=['low', 'mid', 'high', 'v.high'],
                       ax=ax);

In [None]:
# One row per decision-tree parameter combination, with its mean cross-validated score.
results_DT = pd.DataFrame(dt_result.cv_results_['params'])
results_DT['test_score'] = dt_result.cv_results_['mean_test_score']
results_DT.head()

In [None]:
# Mean CV score against depth, averaged over the other parameters, one line per criterion.
alt.Chart(results_DT, 
          title='DT Performance Comparison'
         ).mark_line(point=True).encode(
    alt.X('max_depth', title='Maximum Depth'),
    alt.Y('test_score', title='Mean CV Score', aggregate='average', scale=alt.Scale(zero=False)),
    color='criterion'
)

In [None]:
# Random forest: baseline with default hyper-parameters, scored on the held-out split.
rf_classifier = RandomForestClassifier(random_state=786)
rf_classifier.fit(D_train, t_train)
rf_classifier.score(D_test, t_test)

In [None]:
rf_model, rf_best_estimate, rf_result = grid_search(D_train, t_train, rf_classifier)

In [None]:
rf_best_estimate

In [None]:
# Held-out accuracy of the tuned forest.
rf_model.score(D_test, t_test)

In [None]:
results_RF = pd.DataFrame(rf_result.cv_results_['params'])
results_RF['test_score'] = rf_result.cv_results_['mean_test_score']
results_RF.head()

In [None]:
alt.Chart(results_RF, 
          title='RF Performance Comparison'
         ).mark_line(point=True).encode(
    alt.X('max_depth', title='Maximum Depth'),
    alt.Y('test_score', title='Mean CV Score', aggregate='average', scale=alt.Scale(zero=False)),
    color='criterion'
)

In [None]:
# Support vector machine: baseline with default hyper-parameters.
svm_classifier = SVC()
svm_classifier.fit(D_train, t_train)
svm_classifier.score(D_test, t_test)

In [None]:
svm_model, svm_best_estimate, svm_result = grid_search(D_train, t_train, svm_classifier)

In [None]:
svm_best_estimate

In [None]:
# Score on the held-out split, like every other tuned model above; the original scored
# on the training split, which overstates SVM accuracy relative to the others.
svm_model.score(D_test, t_test)

In [None]:
# One row per SVM parameter combination, with its mean cross-validated score.
results_SVM = pd.DataFrame(svm_result.cv_results_['params'])
results_SVM['test_score'] = svm_result.cv_results_['mean_test_score']
results_SVM.head()

In [None]:
# Mean CV score against C, averaged over gamma, one line per kernel.
# The x axis plots the SVM parameter C; its title was previously 'Maximum Depth',
# copied from the tree charts.
alt.Chart(results_SVM,
          title='SVM Performance Comparison'
         ).mark_line(point=True).encode(
    alt.X('C', title='Regularization parameter C'),
    alt.Y('test_score', title='Mean CV Score', aggregate='average', scale=alt.Scale(zero=False)),
    color='kernel'
)

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# One shared 10-fold splitter so all models are scored on identical folds, which is
# what the paired t-tests below require.
# random_state only has an effect with shuffle=True; current scikit-learn raises
# ValueError when random_state is set while shuffle is False.
cv_method_ttest = StratifiedKFold(n_splits=10, shuffle=True, random_state=786)
cv_results_KNN = cross_val_score(estimator=knn_model, X=Data, y=target, cv=cv_method_ttest, n_jobs=-1, scoring='accuracy')

In [None]:
cv_results_KNN.mean()

In [None]:
# Same folds (cv_method_ttest) for each remaining tuned model.
cv_results_RF = cross_val_score(estimator=rf_model, X=Data, y=target, cv=cv_method_ttest, n_jobs=-1, scoring='accuracy')
cv_results_RF.mean()

In [None]:
cv_results_DT = cross_val_score(estimator=dt_model, X=Data, y=target, cv=cv_method_ttest, n_jobs=-1, scoring='accuracy')
cv_results_DT.mean()

In [None]:
cv_results_SVM = cross_val_score(estimator=svm_model, X=Data, y=target, cv=cv_method_ttest, n_jobs=-1, scoring='accuracy')
cv_results_SVM.mean()

In [None]:
from scipy import stats

# Paired t-test on per-fold accuracy for every pair of models, printed in the order
# KNN-DT, KNN-RF, KNN-SVM, DT-RF, DT-SVM, RF-SVM.
fold_scores = [cv_results_KNN, cv_results_DT, cv_results_RF, cv_results_SVM]
for position, first_scores in enumerate(fold_scores):
    for second_scores in fold_scores[position + 1:]:
        print(stats.ttest_rel(first_scores, second_scores))

In [None]:
from sklearn import metrics
def print_model_stats(model, D_test, t_test):
    """Print held-out evaluation metrics for a fitted classifier.

    Predicts on ``D_test`` and prints, in order: a banner with the model's class name,
    accuracy, the confusion matrix, the classification report and balanced accuracy
    (labelled "Average model accuracy"). Returns nothing.
    """
    predictions = model.predict(D_test)
    print(f"=========={model.__class__.__name__} Model Statistics=============")

    # (label, metric function) pairs, evaluated and printed in this order.
    report_lines = (
        ("Accuracy score:", metrics.accuracy_score),
        ("Confusion Matrix:\n", metrics.confusion_matrix),
        ("Classification report:\n", metrics.classification_report),
        ("Average model accuracy:", metrics.balanced_accuracy_score),
    )
    for label, metric_fn in report_lines:
        print(label, metric_fn(t_test, predictions))

In [None]:
# Held-out metrics for each tuned model, in the order KNN, decision tree, random forest, SVM.
print_model_stats(knn_model, D_test, t_test)

In [None]:
print_model_stats(dt_model, D_test, t_test)

In [None]:
print_model_stats(rf_model, D_test, t_test)

In [None]:
print_model_stats(svm_model, D_test, t_test)

In [None]:
# NOTE(review): repeats the KNN-vs-DT paired t-test already printed in the t-test cell above.
from scipy import stats
print(stats.ttest_rel(cv_results_KNN, cv_results_DT))
