In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import scipy
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Show every column when a frame is displayed, then load the raw data.
pd.set_option('display.max_columns', None)
data = pd.read_csv('/kaggle/input/mobile-price-classification/train.csv')

# One sub-frame per price class and, for each, the row position that marks
# the end of its first 80 %.
p0, p1, p2, p3 = (data[data['price_range'] == label] for label in range(4))
sep0, sep1, sep2, sep3 = (int(0.8 * len(part)) for part in (p0, p1, p2, p3))

# The leading 80 % of every class forms the training set, the rest the test
# set, so both keep the class proportions of the full data.
df_train = pd.concat([p0[:sep0], p1[:sep1], p2[:sep2], p3[:sep3]])
df_test = pd.concat([p0[sep0:], p1[sep1:], p2[sep2:], p3[sep3:]])

In [3]:
# Shuffle the training rows (they were concatenated class by class).
# A fixed seed keeps the row order identical across "Restart & Run All".
df_train = df_train.sample(frac=1, random_state=42)

### Column descriptions:
**battery_power:** Total energy a battery can store in one time measured in mAh

**blue:** Has bluetooth or not

**clock_speed:** speed at which microprocessor executes instructions

**dual_sim:** Has dual sim support or not

**fc:** Front Camera mega pixels

**four_g:** Has 4G or not

**int_memory:** Internal Memory in Gigabytes

**m_dep:** Mobile Depth in cm

**mobile_wt:** Weight of mobile phone

**n_cores:** Number of cores of processor

**pc:** Primary Camera mega pixels

**px_height:** Pixel Resolution Height

**px_width:** Pixel Resolution Width

**ram:** Random Access Memory in Mega Bytes

**sc_h:** Screen Height of mobile in cm

**sc_w:** Screen Width of mobile in cm

**talk_time:** Longest time that a single battery charge will last while you are talking on the phone

**three_g:** Has 3G or not

**touch_screen:** Has touch screen or not

**wifi:** Has wifi or not

**price_range:** This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).

In [4]:
# (rows, columns) of the shuffled training set.
df_train.shape

In [5]:
# Per-column summary of the training set as printed by DataFrame.info().
df_train.info()

In [6]:
# Descriptive statistics of the training columns.
df_train.describe()

### **Delete outliers**

In [7]:
# Remove rows holding IQR outliers, one continuous feature at a time.
#
# The target and two-valued flag columns are skipped on purpose: when a flag's
# first and third quartiles coincide its IQR is 0, so the 1.5 * IQR rule would
# delete every row carrying the rarer value instead of genuine outliers.
# The fences are recomputed for each column on the rows that survived the
# previous columns, as in a sequential clean-up.
TARGET_COLUMN = 'price_range'
for cols in df_train.columns:
    if cols == TARGET_COLUMN or df_train[cols].nunique() <= 2:
        continue
    if df_train[cols].dtype == 'int64' or df_train[cols].dtype == 'float64':
        Q1 = df_train[cols].quantile(0.25)
        Q3 = df_train[cols].quantile(0.75)
        IQR = Q3 - Q1
        lower_range = Q1 - 1.5 * IQR
        upper_range = Q3 + 1.5 * IQR

        # Same outlier definition as before: strictly outside the fences.
        is_outlier = (df_train[cols] > upper_range) | (df_train[cols] < lower_range)
        df_train = df_train.loc[~is_outlier]
df_train

### **Remove duplicated rows**

In [8]:
# Drop duplicated rows from the training set in place, then show the
# resulting (rows, columns).
df_train.drop_duplicates(inplace = True)
df_train.shape

### **Unique values of each column**

In [9]:
# For every column, print a header line followed by its distinct values.
for name, values in df_train.items():
    print('Column *' + name + '* unique values:')
    print(values.unique())

### **Data visualization**

In [10]:
# RAM distribution, one facet (and one colour) per price range.
ram_facets = dict(x="ram", hue="price_range", col="price_range")
sns.displot(data=df_train, **ram_facets)
plt.show()

In [11]:
# Box plot of RAM for each price range.
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=df_train, x="price_range", y="ram", color='purple')
plt.show()

In [12]:
# Box plot of battery power for each price range.
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=df_train, x="price_range", y="battery_power", color='purple')
plt.show()

In [13]:
# Interactive scatter of battery power against RAM, coloured by price range
# (treated as a nominal field).
chart = alt.Chart(df_train).mark_circle(size=20)
chart = chart.encode(x='ram', y='battery_power', color='price_range:N')
chart = chart.interactive().properties(width=600, height=300)
chart

In [14]:
# Box plot of RAM for each price range, split by 4G support.
# (The unused `sns.load_dataset('iris')` call was removed: it loaded an
# unrelated dataset and left a stray `df` global behind.)
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(x=df_train["price_range"], y=df_train["ram"], hue=df_train['four_g'], color='purple')
plt.show()

In [15]:
# Violin plot of RAM for each price range, split by 4G support.
plt.figure(figsize=(10,6))
sns.violinplot(x='price_range',y="ram", hue='four_g',data=df_train, palette='rainbow')
plt.title("Violin Plot of Price Range by Ram, Separated by 4G")
# Render explicitly, like the other plotting cells, so the cell does not end
# on the Text object returned by plt.title.
plt.show()

In [16]:
# Interactive scatter of RAM against number of cores, coloured by price range.
# `:N` marks price_range as a nominal field, matching the other scatter
# charts in this notebook, so the four classes get distinct category colours.
chart=alt.Chart(df_train).mark_circle(size=20).encode(
    x='n_cores',
    y='ram',
    color='price_range:N'
).interactive().properties(
    width=600, height=300
)
chart

In [17]:
# Swarm plots of RAM against each connectivity flag, coloured by price range.
# The theme is set once up front and each plot is drawn on its own axes of a
# single 1x3 grid, so no empty full-figure axes is left underneath the panels
# and `ax` always refers to a real Axes.
sns.set(style="darkgrid")
f, axes = plt.subplots(1, 3, figsize=(25, 8))
for ax, flag in zip(axes, ("four_g", "three_g", "wifi")):
    sns.swarmplot(x=flag, y="ram", hue="price_range",
                  palette="Dark2", data=df_train, ax=ax)
plt.show()

In [18]:
# Interactive scatter of pixel height against pixel width, coloured by price
# range (treated as a nominal field).
chart = alt.Chart(df_train).mark_circle(size=20)
chart = chart.encode(x='px_width', y='px_height', color='price_range:N')
chart = chart.interactive().properties(width=600, height=300)
chart

### **Hypothesis tests**

### q1: The average number of cores, in price range of 3 is 6.

In [19]:
# One-sample t-test: is the mean number of cores in price range 3 equal to 6?
top_range_cores = df_train.loc[df_train['price_range'] == 3, 'n_cores']
statics, p = stats.ttest_1samp(top_range_cores, 6)
print("p value:", p)

##### The p-value is less than 0.05, so H0 is rejected.

### q2: The average amount of battery power is 1241.

In [20]:
# One-sample t-test: is the mean battery power equal to 1241?
battery_power = df_train["battery_power"]
statics, p_value = stats.ttest_1samp(battery_power, 1241)
print("p value:", p_value)

##### The p-value is greater than 0.05, so we fail to reject H0. (This is consistent with the data description, where the statement holds.)

### q3: Price range and touch screen are not related. (both categorical)

In [21]:
# Counts of touch_screen (rows) by price_range (columns), with an "All"
# totals row and column appended by margins=True.
contingency_table = pd.crosstab(
    index=df_train["touch_screen"],
    columns=df_train["price_range"],
    margins=True,
)
contingency_table

In [22]:
# Chi-square test of independence on the observed counts only.
# The crosstab is built with margins=True, so its "All" row and column are
# totals, not observations; passing them to the test inflates the degrees of
# freedom and therefore the p value. They are dropped here (errors="ignore"
# keeps this working if the table carries no margins).
observed = contingency_table.drop(index="All", columns="All", errors="ignore")
chi2_stat , p_value , dof , expected = scipy.stats.chi2_contingency(observed.values)
print("p value:", p_value)

##### The p-value is greater than 0.05, so we fail to reject H0: there is no evidence that they are related.

### q4: Price range and bluetooth are not related. (both categorical)

In [23]:
# Counts of blue (rows) by price_range (columns), with an "All" totals row
# and column appended by margins=True.
contingency_table = pd.crosstab(
    index=df_train["blue"],
    columns=df_train["price_range"],
    margins=True,
)
contingency_table

In [24]:
# Chi-square test of independence on the observed counts only.
# The crosstab is built with margins=True, so its "All" row and column are
# totals, not observations; passing them to the test inflates the degrees of
# freedom and therefore the p value. They are dropped here (errors="ignore"
# keeps this working if the table carries no margins).
observed = contingency_table.drop(index="All", columns="All", errors="ignore")
chi2_stat , p_value , dof , expected = scipy.stats.chi2_contingency(observed.values)
print("p value:", p_value)

##### The p-value is greater than 0.05, so we fail to reject H0: there is no evidence that they are related.

### q5: Internal memory and ram are not related. (both numerical)

In [25]:
# Spearman rank correlation between internal memory and RAM.
internal_memory, ram = df_train["int_memory"], df_train["ram"]
cor, p_value = stats.spearmanr(internal_memory, ram)
print("p value:", p_value)

##### The p-value is greater than 0.05, so we fail to reject H0: there is no evidence that they are related.

### **Logistic regression**

In [26]:
# Detach the target column from both frames: pop() returns the labels and
# removes the column in place, leaving only the features behind.
l_train = df_train.pop("price_range")
l_test = df_test.pop("price_range")

In [27]:
# Baseline: one-vs-rest logistic regression with no other options set,
# trained on the unscaled training features.
logreg = LogisticRegression(multi_class='ovr')
logreg.fit(df_train, l_train)

In [28]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the unscaled test set.
lgr_predict = logreg.predict(df_test)
logreg.score(df_test, l_test)

In [29]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

In [30]:
# Same one-vs-rest model on the unscaled features, with the iteration limit
# raised to 500.
logreg = LogisticRegression(multi_class='ovr', max_iter=500)
logreg.fit(df_train, l_train)

In [31]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the unscaled test set.
lgr_predict = logreg.predict(df_test)
logreg.score(df_test, l_test)

In [32]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

In [33]:
# One-vs-rest model on the unscaled features, now with the newton-cg solver
# and the 500-iteration limit.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train, l_train)

In [34]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the unscaled test set.
lgr_predict = logreg.predict(df_test)
logreg.score(df_test, l_test)

In [35]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

### **Handling imbalanced data**

In [36]:
# Number of training rows per price-range label.
l_train.value_counts()

### **Scaling data**

#### **Min-Max scaler**

In [37]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to both the training and the test features.
mms = preprocessing.MinMaxScaler()
mms.fit(df_train)
df_train_mms, df_test_mms = mms.transform(df_train), mms.transform(df_test)

In [38]:
# Wrap the scaled arrays back into DataFrames carrying the feature names.
df_train_mms, df_test_mms = (
    pd.DataFrame(scaled, columns=df_train.columns)
    for scaled in (df_train_mms, df_test_mms)
)

In [39]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the min-max scaled features.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_mms, l_train)

In [40]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the min-max scaled test set.
lgr_predict = logreg.predict(df_test_mms)
logreg.score(df_test_mms, l_test)

In [41]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **Quantile scaler**

In [42]:
# Quantile transform with a normal output distribution: fitted on the
# training features only, then applied unchanged to the test features.
quantile = QuantileTransformer(output_distribution='normal')

df_train_q = quantile.fit_transform(df_train)
df_test_q = quantile.transform(df_test)

In [43]:
# Wrap the transformed arrays back into DataFrames carrying the feature names.
df_train_q, df_test_q = (
    pd.DataFrame(scaled, columns=df_train.columns)
    for scaled in (df_train_q, df_test_q)
)

In [44]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the quantile-transformed features.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_q, l_train)

In [45]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the quantile-transformed test set.
lgr_predict = logreg.predict(df_test_q)
logreg.score(df_test_q, l_test)

In [46]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **Robust scaler**

In [47]:
# Robust scaling: fitted on the training features only, then applied
# unchanged to the test features.
trans = RobustScaler()

df_train_rb = trans.fit_transform(df_train)
df_test_rb = trans.transform(df_test)

In [48]:
# Wrap the scaled arrays back into DataFrames carrying the feature names.
df_train_rb, df_test_rb = (
    pd.DataFrame(scaled, columns=df_train.columns)
    for scaled in (df_train_rb, df_test_rb)
)

In [49]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the robust-scaled features.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_rb, l_train)

In [50]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the robust-scaled test set.
lgr_predict = logreg.predict(df_test_rb)
logreg.score(df_test_rb, l_test)

In [51]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **Standard scaler**

In [52]:
# Standard scaling: fitted on the training features only, then applied
# unchanged to the test features. Note that `trans` is rebound here; it
# previously held the RobustScaler.
trans = StandardScaler()

df_train_st = trans.fit_transform(df_train)
df_test_st = trans.transform(df_test)

In [53]:
# Wrap the scaled arrays back into DataFrames carrying the feature names.
df_train_st, df_test_st = (
    pd.DataFrame(scaled, columns=df_train.columns)
    for scaled in (df_train_st, df_test_st)
)

In [54]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the standard-scaled features.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_st, l_train)

In [55]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the standard-scaled test set.
lgr_predict = logreg.predict(df_test_st)
logreg.score(df_test_st, l_test)

In [56]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

### **PCA**

#### **pov = 0.75**

In [57]:
# PCA fitted on the standard-scaled training features and then applied to
# the scaled test features. The fractional n_components (0.75) is the
# proportion-of-variance target named in the section heading.
pca = PCA(n_components = 0.75, svd_solver = 'full')

df_train_75 = pca.fit_transform(df_train_st)
df_test_75 = pca.transform(df_test_st)

In [58]:
# Number of components kept by the PCA fitted just above.
pca.n_components_

In [59]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the PCA projection of the training features (n_components = 0.75).
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_75, l_train)

In [60]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the PCA-projected test set.
lgr_predict = logreg.predict(df_test_75)
logreg.score(df_test_75, l_test)

In [61]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **pov = 0.8**

In [62]:
# PCA fitted on the standard-scaled training features and then applied to
# the scaled test features. The fractional n_components (0.8) is the
# proportion-of-variance target named in the section heading.
pca = PCA(n_components = 0.8, svd_solver = 'full')

df_train_8 = pca.fit_transform(df_train_st)
df_test_8 = pca.transform(df_test_st)

In [63]:
# Number of components kept by the PCA fitted just above.
pca.n_components_

In [64]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the PCA projection of the training features (n_components = 0.8).
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_8, l_train)

In [65]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the PCA-projected test set.
lgr_predict = logreg.predict(df_test_8)
logreg.score(df_test_8, l_test)

In [66]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **pov = 0.9**

In [67]:
# PCA fitted on the standard-scaled training features and then applied to
# the scaled test features. The fractional n_components (0.9) is the
# proportion-of-variance target named in the section heading.
pca = PCA(n_components = 0.9, svd_solver = 'full')

df_train_9 = pca.fit_transform(df_train_st)
df_test_9 = pca.transform(df_test_st)

In [68]:
# Number of components kept by the PCA fitted just above.
pca.n_components_

In [69]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the PCA projection of the training features (n_components = 0.9).
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_9, l_train)

In [70]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the PCA-projected test set.
lgr_predict = logreg.predict(df_test_9)
logreg.score(df_test_9, l_test)

In [71]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **pov = 0.95**


In [72]:
# PCA fitted on the standard-scaled training features and then applied to
# the scaled test features. The fractional n_components (0.95) is the
# proportion-of-variance target named in the section heading.
pca = PCA(n_components = 0.95, svd_solver = 'full')

df_train_95 = pca.fit_transform(df_train_st)
df_test_95 = pca.transform(df_test_st)

In [73]:
# Number of components kept by the PCA fitted just above.
pca.n_components_

In [74]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the PCA projection of the training features (n_components = 0.95).
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_95, l_train)

In [75]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the PCA-projected test set.
lgr_predict = logreg.predict(df_test_95)
logreg.score(df_test_95, l_test)

In [76]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

#### **pov = 0.99**


In [77]:
# PCA fitted on the standard-scaled training features and then applied to
# the scaled test features. The fractional n_components (0.99) is the
# proportion-of-variance target named in the section heading.
pca = PCA(n_components = 0.99, svd_solver = 'full')

df_train_99 = pca.fit_transform(df_train_st)
df_test_99 = pca.transform(df_test_st)

In [78]:
# Number of components kept by the PCA fitted just above.
pca.n_components_

In [79]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the PCA projection of the training features (n_components = 0.99).
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_99, l_train)

In [80]:
# Predict the test labels (kept for the confusion matrix) and show the
# model's score on the PCA-projected test set.
lgr_predict = logreg.predict(df_test_99)
logreg.score(df_test_99, l_test)

In [81]:
# Confusion matrix of the true test labels (rows) against the latest
# predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Predicted 0s', 'Predicted 1s', 'Predicted 2s', 'Predicted 3s'),
)
ax.yaxis.set(
    ticks=(0, 1, 2, 3),
    ticklabels=('Actual 0s', 'Actual 1s', 'Actual 2s', 'Actual 3s'),
)
ax.set_ylim(3.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 4x4 grid row by row.
for row, col in np.ndindex(4, 4):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()

### **Imbalancing the data**

In [82]:
# Collapse price ranges 1, 2 and 3 into a single label 4, leaving class 0 as
# the other class, for both the training and the test labels.
imb_l_train, imb_l_test = (
    labels.replace([1, 2, 3], 4) for labels in (l_train, l_test)
)
imb_l_train.value_counts()

In [83]:
# Pie chart of the two-class training label shares, annotated as percentages.
imb_l_train.value_counts().plot.pie(autopct='%.2f')

#### **Random oversampling**

In [84]:
# Randomly over-sample every class except the majority one so the two-class
# training labels become balanced. The fixed seed makes the duplicated rows
# (and every result downstream) reproducible across runs.
ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)
df_train_res, l_train_res = ros.fit_resample(df_train, imb_l_train)

# Pie chart of the label shares after resampling.
ax = l_train_res.value_counts().plot.pie(autopct='%.2f')
_ = ax.set_title("Over-sampling")

In [85]:
# A fresh StandardScaler fitted on the over-sampled training features.
# `trans` is rebound here, so it no longer refers to the scaler that
# produced df_train_st / df_test_st.
trans = StandardScaler()

df_train_res_st = trans.fit_transform(df_train_res)

In [86]:
# Wrap the scaled array back into a DataFrame carrying the feature names.
df_train_res_st = pd.DataFrame(df_train_res_st, columns=df_train.columns)

In [87]:
# One-vs-rest logistic regression (newton-cg, up to 500 iterations) trained
# on the scaled, over-sampled features and their two-class labels.
logreg = LogisticRegression(multi_class='ovr', max_iter=500, solver='newton-cg')
logreg.fit(df_train_res_st, l_train_res)

In [88]:
# Scale the test features with `trans`, the scaler fitted on the over-sampled
# training data, so the model is evaluated on inputs on the same scale it was
# trained on. (df_test_st was produced by the earlier scaler, fitted on the
# training data before resampling.)
df_test_res_st = pd.DataFrame(trans.transform(df_test), columns=df_test.columns)

# Predict the two-class test labels (kept for the confusion matrix) and show
# the model's score against them.
lgr_predict = logreg.predict(df_test_res_st)
logreg.score(df_test_res_st, imb_l_test)

In [89]:
# Confusion matrix of the true two-class test labels (rows) against the
# latest predictions (columns), drawn as an annotated heat map.
cm = confusion_matrix(imb_l_test, lgr_predict)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(
    ticks=(0, 1),
    ticklabels=('Predicted 0s', 'Predicted 4s'),
)
ax.yaxis.set(
    ticks=(0, 1),
    ticklabels=('Actual 0s', 'Actual 4s'),
)
ax.set_ylim(1.5, -0.5)  # descending limits put class 0 in the top row
# Write every count into its cell, walking the 2x2 grid row by row.
for row, col in np.ndindex(2, 2):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='white')
plt.show()