In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# init plotly for jupyter notebook
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
# importing the dataset
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
df_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# null values
df_train.isna().mean()

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [5]:
# correlation
fig = go.Figure(
    go.Heatmap(
        x=list(df_train.columns),
        y = list(df_train.columns),
        z=df_train.corr(),
        colorscale="rdbu",
    )
)
fig.update_layout(
    height=700,
    width=800,
    margin=dict(
        pad=2
    ),
    title="Correlation between variables"
    
)

In [6]:
# how variables change with respect to price_range
fig = px.scatter(df_train,x='battery_power',y='price_range')
fig

In [7]:
df_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [8]:
fig = px.histogram(x=df_train["battery_power"],color_discrete_sequence=["orange"],nbins=100)
fig.update_layout(
    title="Variable Distribution",
    plot_bgcolor="white",
    xaxis_title="battery_power",
    showlegend=False
)

## Data Pre Processing

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [30]:
df_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
x = df_train.iloc[:,:-1]
y = df_train.iloc[:,-1]

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

## Train Test Split

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(x,y,test_size=0.25)

## Decision Tree

In [75]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion="gini")
dtc.fit(x_train,y_train)

# make predictions
y_train_pred = dtc.predict(x_train)
print(f"Accuracy is {(y_train_pred == y_train).mean()*100}%")

y_test_pred = dtc.predict(x_test)
print(f"Accuracy is {(y_test_pred == y_test).mean()*100}%")

Accuracy is 100.0%
Accuracy is 81.8%


## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression()
lrc.fit(x_train,y_train)

# make predictions
y_train_pred = lrc.predict(x_train)
print(f"Accuracy is {(y_train_pred == y_train).mean()*100}%")

y_test_pred = lrc.predict(x_test)
print(f"Accuracy is {(y_test_pred == y_test).mean()*100}%")

Accuracy is 97.93333333333332%
Accuracy is 95.8%


## Random Forest

In [96]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100,random_state=1123133,max_depth=8)
rfc.fit(x_train,y_train)

# make predictions
y_train_pred = rfc.predict(x_train)
print(f"Accuracy is {(y_train_pred == y_train).mean()*100}%")

y_test_pred = rfc.predict(x_test)
print(f"Accuracy is {(y_test_pred == y_test).mean()*100}%")

Accuracy is 99.2%
Accuracy is 84.0%


## K Nearest Neighbors

In [68]:
from sklearn.neighbors import KNeighborsClassifier
knnc = KNeighborsClassifier(n_neighbors=5)
knnc.fit(x_train,y_train)

# make predictions
y_train_pred = knnc.predict(x_train)
print(f"Accuracy is {(y_train_pred == y_train).mean()*100}%")

y_test_pred = knnc.predict(x_test)
print(f"Accuracy is {(y_test_pred == y_test).mean()*100}%")

Accuracy is 71.33333333333334%
Accuracy is 53.0%
