In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training split and preview its first rows.
train_csv_path = '/kaggle/input/mobile-price-classification/train.csv'
df_train = pd.read_csv(train_csv_path, encoding='utf-8')
df_train.head()

In [None]:
# Load the test split and preview its first rows.
test_csv_path = '/kaggle/input/mobile-price-classification/test.csv'
df_test = pd.read_csv(test_csv_path, encoding='utf-8')
df_test.head()

In [None]:
# Missing-value count for every column of the training frame.
df_train.isna().sum()

In [None]:
# Missing-value count for every column of the test frame.
df_test.isna().sum()

In [None]:
# (rows, columns) of each frame: training first, then test.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Number of fully duplicated rows in each frame: training first, then test.
for frame in (df_train, df_test):
    print(frame.duplicated().sum())

In [None]:
# Summary of the training frame: column dtypes, non-null counts and memory use.
df_train.info()

In [None]:
# Summary of the test frame: column dtypes, non-null counts and memory use.
df_test.info()

In [None]:
# One box plot per training column, arranged on a 5x5 grid, to eyeball outliers.
df_train.plot.box(subplots=True, figsize=(20, 20), layout=(5, 5))
plt.show()

In [None]:
# Frequency of each distinct value in the `three_g` column of the training frame.
df_train['three_g'].value_counts()

In [None]:
# One box plot per test column, arranged on a 5x5 grid, to eyeball outliers.
df_test.plot.box(subplots=True, figsize=(20, 20), layout=(5, 5))
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations in the training frame.
plt.figure(figsize=(20, 7))
train_corr = df_train.corr()
sns.heatmap(train_corr, annot=True, cmap="Blues")

## Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the `price_range` target.
x = df_train.drop(['price_range'], axis=1).values
y = df_train['price_range'].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

# Learn the scaling statistics from the training rows only ...
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
# ... and reuse them for the held-out rows. Re-fitting on x_test would scale
# the two splits with different means/variances and leak test-set statistics
# into the evaluation.
x_test = sc.transform(x_test)

In [None]:
# Shapes of the scaled training and held-out feature matrices.
for split in (x_train, x_test):
    print(split.shape)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier with default hyper-parameters.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# Accuracy on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(logreg.score(features, labels))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# max_features limits how many features are examined at each split, and that
# subset is drawn at random. Seed the tree (same seed as the train/test split)
# so the scores, the tree plot and the importance charts are reproducible.
dt = DecisionTreeClassifier(max_depth=5, max_features=17, random_state=42)
dt.fit(x_train, y_train)

# Accuracy on the training split, then on the held-out split.
print(dt.score(x_train, y_train))
print(dt.score(x_test, y_test))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and per-split feature sampling are random. Seed the
# forest (same seed as the train/test split) so the reported scores, the
# classification report and the cross-validation below are reproducible.
rf = RandomForestClassifier(n_estimators=5, max_depth=5, max_features=17, random_state=42)
rf.fit(x_train, y_train)

# Accuracy on the training split, then on the held-out split.
print(rf.score(x_train, y_train))
print(rf.score(x_test, y_test))

### Feature importance

In [None]:
from sklearn import tree
# Large figure so the nodes of the tree stay legible.
fig = plt.figure(figsize= (15,12))
# Draw the fitted decision tree `dt`; filled=True colours the nodes.
tree.plot_tree(dt, filled= True)

In [None]:
# `i`: every training column except the target; `j`: the target column.
# The short names are kept because later cells refer to them.
i = df_train.drop(columns=['price_range'])
j = df_train['price_range']

In [None]:
# Names of the feature columns (the `price_range` target is excluded).
i.columns

In [None]:
def f_importances(coef, names, top=-1):
    """Draw a horizontal bar chart of the largest feature importances.

    Parameters
    ----------
    coef : sequence of numbers
        One importance value per feature.
    names : sequence
        Feature labels, aligned position-by-position with ``coef``.
    top : int, default -1
        How many of the largest importances to draw. ``-1`` (or any other
        negative value, or ``None``) draws every feature. Values larger than
        the number of features are clamped to that number.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Sort ascending by importance (ties fall back to comparing the names),
    # then flip so the largest importance comes first.
    ranked = sorted(zip(coef, names))[::-1]

    # Clamp `top` so the bar positions and the bar values always have the same
    # length; an out-of-range `top` would otherwise make plt.barh raise.
    if top is None or top < 0:
        top = len(ranked)
    top = min(top, len(ranked))

    ranked = ranked[:top]
    values = [value for value, _ in ranked]
    labels = [label for _, label in ranked]

    plt.barh(range(top), values, align='center')
    plt.yticks(range(top), labels)
    plt.title('feature importances')
    plt.show()

# Labels for the bars: the feature column names, in training order.
features_names = i.columns

# Chart the six largest importances of the decision tree.
# The absolute value is taken first; drop np.abs() if the sign of a
# contribution matters for the model being inspected.
f_importances(np.abs(dt.feature_importances_), features_names, top=6)


In [None]:
# Horizontal bar chart of every feature's importance in the decision tree.
importance_by_feature = pd.Series(dt.feature_importances_, index=i.columns)
importance_by_feature.plot.barh()

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 15 boosting rounds, tree depth capped at 3.
xgb = XGBClassifier(max_depth=3, n_estimators=15)
xgb.fit(x_train, y_train)

# Accuracy on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(xgb.score(features, labels))

## AdaBoost


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with its default hyper-parameters.
ada = AdaBoostClassifier()
ada.fit(x_train, y_train)

# Accuracy on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(ada.score(features, labels))

In [None]:
# Show the hyper-parameters the AdaBoost model is configured with.
ada.get_params()

## Voting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Fresh, unfitted base learners for the voting ensembles below.
# The AdaBoost and random-forest members are seeded (same seed as the
# train/test split) so the ensemble scores are reproducible between runs.
clf1 = LogisticRegression()
clf2 = AdaBoostClassifier(random_state=42)
clf3 = XGBClassifier(n_estimators=15, max_depth=3)
clf4 = RandomForestClassifier(n_estimators=5, max_depth=5, max_features=17, random_state=42)

In [None]:
# Soft voting: the ensemble is built with voting='soft' over the four base learners.
soft_members = [('logreg', clf1), ('ada', clf2), ('xgb', clf3), ('RF', clf4)]
v_clf = VotingClassifier(estimators=soft_members, voting='soft')
v_clf.fit(x_train, y_train)

In [None]:
# Soft voting: accuracy on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(v_clf.score(features, labels))

In [None]:
# Hard voting: the same four base learners, rebuilt with voting='hard'.
# Note that this rebinds `v_clf`, replacing the soft-voting ensemble.
hard_members = [('logreg', clf1), ('ada', clf2), ('xgb', clf3), ('RF', clf4)]
v_clf = VotingClassifier(estimators=hard_members, voting='hard')
v_clf.fit(x_train, y_train)

In [None]:
# Hard voting: accuracy on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(v_clf.score(features, labels))

### Select best features

### Evaluation

In [None]:
# Class predictions of the XGBoost model for the held-out split.
y_pred = xgb.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the XGBoost predictions (true labels first, then predictions).
con = confusion_matrix(y_true=y_test, y_pred=y_pred)
con

In [None]:
# Dependency note: the confusion-matrix plot imports mlxtend.
# If it is not installed, run `pip install mlxtend` first.

In [None]:
from mlxtend.plotting import plot_confusion_matrix

# Render the confusion matrix held in `con` as a figure.
plot_confusion_matrix(conf_mat=con)

In [None]:
# Number of held-out labels.
y_test.shape

In [None]:
# Classification report for the XGBoost model on the held-out split.
xgb_report = classification_report(y_test, xgb.predict(x_test))
print(xgb_report)

In [None]:
# Classification report for the random forest on the held-out split.
rf_report = classification_report(y_test, rf.predict(x_test))
print(rf_report)

In [None]:
# Classification report for the decision tree on the held-out split.
dt_report = classification_report(y_test, dt.predict(x_test))
print(dt_report)

### Cross Validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the random forest `rf`.
# It runs on the full feature matrix `x` and labels `y`, i.e. the unscaled
# values taken from the training frame before the train/test split.
scoring = 'accuracy'
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(estimator=rf, X=x, y=y, scoring=scoring, cv=kfold)
print(score)

In [None]:
# Mean cross-validated accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameter values explored by the grid search.
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'max_features': [7, 8, 9],
    'n_estimators': [1, 7, 8, 9],
}
param_grid

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split. The forest is seeded (same seed as the train/test split) so
# the selected parameters and best score are reproducible between runs.
clf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
grid_result

In [None]:
# Report the best cross-validated score and the parameters that produced it.
best_score, best_params = grid_result.best_score_, grid_result.best_params_
print('Best: %f using %s' % (best_score, best_params))

In [None]:
# Keep the grid search's best estimator for evaluation and display it.
model = grid_result.best_estimator_
model

In [None]:
# Confusion matrix of the tuned random forest (`model`) on the held-out split.
# confusion_matrix needs the true labels and the predictions; calling it with
# no arguments raises a TypeError.
con = confusion_matrix(y_test, model.predict(x_test))
con

In [None]:
# Notes:
# - confusion matrix
# - classification report
# - cross-validation
# - grid search
# - plot_roc_curve