In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the mobile-price training set.
train = pd.read_csv('/kaggle/input/mobile-price-classification/train.csv')
# Preview only the first rows instead of rendering the whole frame.
train.head()

### Column descriptions:
**battery_power:** Total energy the battery can store at one time, measured in mAh

**blue:** Has bluetooth or not

**clock_speed:** speed at which microprocessor executes instructions

**dual_sim:** Has dual sim support or not

**fc:** Front Camera mega pixels

**four_g:** Has 4G or not

**int_memory:** Internal Memory in Gigabytes

**m_dep:** Mobile Depth in cm

**mobile_wt:** Weight of mobile phone

**n_cores:** Number of cores of processor

**pc:** Primary Camera mega pixels

**px_height:** Pixel Resolution Height

**px_width:** Pixel Resolution Width

**ram:** Random Access Memory in Mega Bytes

**sc_h:** Screen Height of mobile in cm

**sc_w:** Screen Width of mobile in cm

**talk_time:** Longest time that a single battery charge will last when you are talking on the phone

**three_g:** Has 3G or not

**touch_screen:** Has touch screen or not

**wifi:** Has wifi or not

**price_range:** This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).

In [3]:
# Count the missing values in each column.
train.isnull().sum()

In [4]:
# Summarise column dtypes and non-null counts.
train.info()

In [5]:
# Number of distinct target classes.
train['price_range'].nunique()

In [6]:
# Non-null row count of every column, per target class.
train.groupby(by='price_range').count()

In [7]:
# Collapse the four price classes into two: {0, 1} -> 0 (low) and {2, 3} -> 1 (high).
# The mapped column is assigned back explicitly: calling `replace(..., inplace=True)` on the
# attribute-accessed Series is a chained in-place update that is not guaranteed to write
# through to `train` (pandas Copy-on-Write).
# NOTE: run this once per load of `train`; re-running it on the already-binarised column
# would map the "high" class (1) back to 0.
train['price_range'] = train['price_range'].map({0: 0, 1: 0, 2: 1, 3: 1})

In [8]:
# Re-check the per-class row counts after the target has been recoded.
train.groupby(by='price_range').count()

In [9]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns=['price_range']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [10]:
from sklearn.metrics import auc

def forward_selection(X, y):
  """Greedy forward feature selection scored by logistic-regression accuracy.

  ``X`` and ``y`` are split once into an 80/20 train/validation partition
  (``random_state=0``). Starting from an empty set, every round adds the
  remaining column whose inclusion yields the highest validation accuracy,
  until all columns have been added.

  Parameters
  ----------
  X : pandas.DataFrame
      Candidate feature columns.
  y : array-like
      Target labels aligned with the rows of ``X``.

  Returns
  -------
  dict
      ``'features'``: columns in the order they were added.
      ``'scores'``: validation accuracy obtained after each addition.
      ``'features_rank'``: ``range(len(X.columns))``, usable as a plot x-axis.
      ``'best_features'``: the prefix of ``'features'`` up to and including
      the first step that reached the highest score (empty if ``X`` has no
      columns).
  """
  # The partition depends only on the number of rows and the seed, so one
  # split is equivalent to re-splitting every candidate subset.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  selected = []
  remaining = list(X.columns)
  final_features = {'features': [], 'scores': []}
  while remaining:
    # Start below any possible accuracy so a candidate is always chosen,
    # even when every candidate scores 0.
    best_feature, best_score = None, float('-inf')
    for feature in remaining:
      candidate = selected + [feature]
      model = LogisticRegression()
      model.fit(X_train[candidate], y_train)
      score = model.score(X_test[candidate], y_test)
      # Strict comparison: on ties the earliest column wins.
      if score > best_score:
        best_feature, best_score = feature, score
    remaining.remove(best_feature)
    selected.append(best_feature)

    final_features['features'].append(best_feature)
    final_features['scores'].append(best_score)
  final_features['features_rank'] = range(len(X.columns))

  scores = final_features['scores']
  if scores:
    # list.index returns the first occurrence, i.e. the smallest best subset.
    best_index = scores.index(max(scores))
    # +1 so the feature that produced the best score is part of the subset.
    final_features['best_features'] = final_features['features'][:best_index + 1]
  else:
    final_features['best_features'] = []
  return final_features

In [11]:
# Standardise the training features; the scaler is fitted on the training split only.
standard_scaler = StandardScaler()
standard_scaler.fit(x_train)
x_train[x_train.columns] = standard_scaler.transform(x_train)

In [12]:
# Apply the scaler fitted on the training split to the test split.
# NOTE: this overwrites x_test, so running the cell twice scales the data twice.
x_test[x_test.columns] = standard_scaler.transform(x_test)

In [13]:
# Run forward selection on the training split and print the resulting dict.
fs = forward_selection(x_train, y_train)
print(fs)

In [14]:
# Validation accuracy after each forward-selection step; labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(fs['features_rank'], fs['scores'], marker='o')
ax.set(
    title='Forward selection: accuracy per step',
    xlabel='Selection step (0 = first feature added)',
    ylabel='Validation accuracy',
)
plt.show()

In [15]:
# Show the feature subset stored under 'best_features' by forward_selection.
print(fs['best_features'])

In [16]:
# Hard-coded feature subset (presumably copied from the forward-selection output;
# keep it in sync if the selection changes).
selected_features = ['ram', 'battery_power', 'px_height', 'px_width', 'blue', 'clock_speed']

logisticReg = LogisticRegression()
logisticReg.fit(x_train[selected_features], y_train)
y_pred = logisticReg.predict(x_test[selected_features])
# Score the predictions against the true test labels. Comparing the predictions
# with themselves would always report perfect precision and recall.
print(classification_report(y_test, y_pred))

In [17]:
# Fit a six-component PCA on the training split, then project both splits with it.
pca = PCA(n_components=6).fit(x_train)
pca_train, pca_test = pca.transform(x_train), pca.transform(x_test)

In [18]:
# Logistic regression on the PCA components, evaluated against the true test labels.
pca_model = LogisticRegression().fit(pca_train, y_train)
pca_test_pred = pca_model.predict(pca_test)
print(classification_report(y_test, pca_test_pred))

In [19]:
# Discretise battery_power into four equal-width bins labelled 0..3.
cl = list(range(4))
train['battery_bin'] = pd.cut(train['battery_power'], bins=4, labels=cl)

In [20]:
# Same 80/20 split (seed 0) on the frame that now carries battery_bin.
x_train_bin, x_test_bin, y_train_bin, y_test_bin = train_test_split(
    train.drop(columns=['price_range']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [21]:
# Train an SVM with default settings, using battery_bin in place of the raw battery_power.
bin_svm = svm.SVC()
bin_svm.fit(x_train_bin.drop(columns=['battery_power']), y_train_bin)

In [22]:
# Confusion matrix of the binned-battery SVM on its held-out split.
bin_test_features = x_test_bin.drop(columns=['battery_power'])
cm = confusion_matrix(y_test_bin, bin_svm.predict(bin_test_features))
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [23]:
# How many training rows fall into each of ten equal-width battery_power bins.
pd.cut(x_train_bin['battery_power'], bins=10).value_counts()

In [24]:
# Smallest battery_power value in the training split.
min(x_train_bin['battery_power'])

In [25]:
# Largest battery_power value in the training split.
max(x_train_bin['battery_power'])

In [26]:
# Hand-picked bin edges spanning the observed battery_power range.
cb = [min(x_train_bin.battery_power), 700, 1000, 1300, 1600, max(x_train_bin.battery_power)]
# Bins are right-closed, so without include_lowest the row(s) holding the minimum
# value fall outside the first bin and are left out of the counts.
pd.cut(x_train_bin.battery_power, bins=cb, include_lowest=True).value_counts()

In [27]:
# Distribution of the raw front-camera megapixel values.
train['fc'].hist(grid=False)

In [28]:
# Distribution of log(fc); the small offset is presumably there to keep log() finite for fc == 0.
np.log(train['fc'] + 1e-6).hist(grid=False)

In [29]:
# Store the log-transformed front-camera value as a new column (same offset as in the histogram).
train['log_fc'] = np.log(train['fc'] + 1e-6)

In [30]:
# Split with log_fc replacing fc; battery_bin from the earlier experiment is excluded.
x_train_log, x_test_log, y_train_log, y_test_log = train_test_split(
    train.drop(columns=['price_range', 'battery_bin', 'fc']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [31]:
# Train a default SVM on the log_fc feature set.
log_svm = svm.SVC()
log_svm.fit(X=x_train_log, y=y_train_log)

In [32]:
# Confusion matrix of the log_fc SVM on its held-out split.
log_test_pred = log_svm.predict(x_test_log)
cm = confusion_matrix(y_test_log, log_test_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [33]:
# Screen area as the product of screen width and screen height.
train['area'] = train['sc_w'] * train['sc_h']

In [34]:
# Split with the new area column; the battery_bin and log_fc experiment columns are excluded.
x_train_area, x_test_area, y_train_area, y_test_area = train_test_split(
    train.drop(columns=['price_range', 'battery_bin', 'log_fc']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [35]:
# Train a default SVM on the feature set that includes area.
area_svm = svm.SVC()
area_svm.fit(X=x_train_area, y=y_train_area)

In [36]:
# Confusion matrix of the area SVM on its held-out split.
area_test_pred = area_svm.predict(x_test_area)
cm = confusion_matrix(y_test_area, area_test_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [37]:
# Treat every column with fewer than three distinct values as categorical.
categorical_features = [
    column
    for column in train.columns.to_list()
    if len(train[column].unique()) < 3
]

# price_range is the target, so it must not be one-hot encoded with the features.
categorical_features.remove('price_range')
categorical_features

In [38]:
# One-hot encode the categorical columns, using each column name as the dummy prefix.
# NOTE: this rebinds `train`; the encoded source columns are replaced by their dummies.
train = pd.get_dummies(train, prefix=categorical_features, columns=categorical_features)
train.sample(n=3)

In [39]:
# Split the one-hot encoded frame, leaving out the columns engineered in earlier experiments.
x_train_one_hot, x_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(
    train.drop(columns=['price_range', 'battery_bin', 'log_fc', 'area']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [40]:
# Train a default SVM on the one-hot encoded feature set.
one_hot_svm = svm.SVC()
one_hot_svm.fit(X=x_train_one_hot, y=y_train_one_hot)

In [41]:
# Confusion matrix of the one-hot SVM on its held-out split.
one_hot_test_pred = one_hot_svm.predict(x_test_one_hot)
cm = confusion_matrix(y_test_one_hot, one_hot_test_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [42]:
# Combine all engineered columns: drop the raw battery_power and fc, keep their derived versions.
x_train_mix, x_test_mix, y_train_mix, y_test_mix = train_test_split(
    train.drop(columns=['battery_power', 'fc', 'price_range']),
    train['price_range'],
    test_size=0.2,
    random_state=0,
)

In [43]:
# Train a default SVM on the combined engineered feature set.
mix_svm = svm.SVC()
mix_svm.fit(X=x_train_mix, y=y_train_mix)

In [44]:
# Confusion matrix of the combined-features SVM on its held-out split.
mix_test_pred = mix_svm.predict(x_test_mix)
cm = confusion_matrix(y_test_mix, mix_test_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [89]:
def backward_selection(X, y):
  """Greedy backward feature elimination scored by logistic-regression accuracy.

  ``X`` and ``y`` are split once into an 80/20 train/validation partition
  (``random_state=0``). Starting from all columns, every round removes the
  column whose removal leaves the highest validation accuracy. Elimination
  stops when a single column is left, because a model cannot be fitted on
  zero features.

  Parameters
  ----------
  X : pandas.DataFrame
      Candidate feature columns.
  y : array-like
      Target labels aligned with the rows of ``X``.

  Returns
  -------
  dict
      ``'features'``: columns in the order they were eliminated
      (``len(X.columns) - 1`` entries at most).
      ``'scores'``: validation accuracy of the model left after each
      elimination.
      ``'features_rank'``: ``range(len(scores))``, usable as a plot x-axis.
      ``'best_features'``: the columns still retained at the point of highest
      validation accuracy. The full column set counts as the starting
      candidate and is only replaced by a strictly better subset.
  """
  # The partition depends only on the number of rows and the seed, so one
  # split is equivalent to re-splitting every candidate subset.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  def holdout_score(columns):
    # Validation accuracy of a logistic regression restricted to `columns`.
    model = LogisticRegression()
    model.fit(X_train[columns], y_train)
    return model.score(X_test[columns], y_test)

  kept = list(X.columns)
  final_features = {'features': [], 'scores': []}

  # Baseline: the model with every column, so "remove nothing" can win.
  best_features = list(kept)
  best_overall = holdout_score(kept) if kept else float('-inf')

  while len(kept) > 1:
    worst_feature, worst_score = None, float('-inf')
    for feature in kept:
      candidate = [column for column in kept if column != feature]
      score = holdout_score(candidate)
      # Strict comparison: on ties the earliest column is eliminated.
      if score > worst_score:
        worst_feature, worst_score = feature, score
    # Actually shrink the working set; each round builds on the previous one.
    kept.remove(worst_feature)

    final_features['features'].append(worst_feature)
    final_features['scores'].append(worst_score)
    if worst_score > best_overall:
      best_overall = worst_score
      best_features = list(kept)
  final_features['features_rank'] = range(len(final_features['scores']))

  final_features['best_features'] = best_features
  return final_features

In [90]:
# Run backward selection on the training split and print the resulting dict.
bs = backward_selection(x_train, y_train)
print(bs)

In [92]:
# Hard-coded column list used for both fitting and prediction (presumably taken
# from the backward-selection output; verify against `bs`).
backward_features = [
    'int_memory', 'clock_speed', 'fc', 'four_g', 'm_dep', 'talk_time', 'three_g', 'blue', 'dual_sim',
    'n_cores', 'pc', 'sc_h', 'sc_w', 'wifi', 'touch_screen', 'mobile_wt', 'px_width', 'px_height',
    'battery_power', 'ram',
]

logisticReg = LogisticRegression()
logisticReg.fit(x_train[backward_features], y_train)
y_prd = logisticReg.predict(x_test[backward_features])

In [93]:
# Evaluate this model's predictions (y_prd) against the true test labels.
# The earlier `y_pred` holds another model's predictions and is not ground truth.
print(classification_report(y_test, y_prd))