In [2]:
#import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE, RFECV
from sklearn.decomposition import PCA

In [42]:
# Read the training CSV into a DataFrame, report its shape, and preview it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
train_csv = "E://codes//Python//train.csv"
data = pd.read_csv(train_csv)
print("Shape of dataframe: ", data.shape)
data.head()

Shape of dataframe:  (2000, 21)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# Count null entries per column, then display only the columns that have any.
missing = data.isnull().sum()
has_nulls = missing > 0
missing.loc[has_nulls]

Series([], dtype: int64)

# 4. Data description

In [43]:
# Tally how many columns there are of each dtype.
column_dtypes = data.dtypes
column_dtypes.value_counts()

int64      19
float64     2
dtype: int64

In [44]:
# Z-score every feature column: subtract the column mean and divide by the
# column standard deviation (pandas' default .std(), i.e. ddof=1).
original_features = data.drop(columns = 'price_range')
feature_means = original_features.mean()
feature_stds = original_features.std()
centered_features = original_features - feature_means
standard_features = centered_features / feature_stds

# Put the untouched target first, followed by the standardised features.
standard_data = pd.concat([data['price_range'], standard_features], axis = 1)

In [45]:
# Divide the standardised features into 3 groups.
# Column 0 of standard_data is the target (price_range); the features occupy
# every column after it. The last group uses an open-ended slice so that the
# final feature column is included rather than cut off by a fixed stop index.
feature_1 = standard_data.iloc[:, 1:7]
feature_2 = standard_data.iloc[:, 7:14]
feature_3 = standard_data.iloc[:, 14:]

# 5. Exploratory data analysis (EDA)

In [35]:
# Preview the first five rows of the standardised frame (target column first).
standard_data.head(5)

Unnamed: 0,price_range,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,-0.902372,-0.989802,0.830572,-1.018929,-0.762304,-1.043705,-1.380298,0.340654,1.348911,...,-1.305424,-1.408596,-1.146497,0.391605,-0.784787,0.283032,1.462128,-1.786414,-1.005767,0.98585
1,2,-0.495015,1.009798,-1.252751,0.980932,-0.992642,0.957646,1.154735,0.687376,-0.120029,...,-0.645827,0.585631,1.704039,0.4672,1.113987,-0.635158,-0.734084,0.559501,0.993769,-1.013846
2,2,-1.537302,1.009798,-1.252751,0.980932,-0.531966,0.957646,0.493422,1.38082,0.13421,...,-0.645827,1.392336,1.074699,0.441387,-0.310094,-0.864705,-0.368048,0.559501,0.993769,-1.013846
3,2,-1.418964,1.009798,1.198217,-1.018929,-0.992642,-1.043705,-1.21497,1.034098,-0.261274,...,-0.15113,1.286428,1.236662,0.594421,0.87664,0.512579,-0.002013,0.559501,-1.005767,-1.013846
4,1,1.325574,1.009798,-0.394912,-1.018929,2.001753,0.957646,0.658751,0.340654,0.021215,...,0.673365,1.268401,-0.091429,-0.657502,-1.022134,-0.864705,0.730057,0.559501,0.993769,-1.013846


# 5.1 Target variable

In [37]:
# Extract the target column and count how many rows fall in each class.
# price_range is also the last column of `data`, so data.iloc[:, -1] would
# select the same Series.
target = data.loc[:, 'price_range']
target.value_counts()

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64

In [13]:
# Attach the target to the first group of standardised features so the result
# can be melted to long format. Building it from feature_1 (instead of from
# feature_mean itself) keeps the cell runnable on a fresh kernel and makes
# re-running it harmless.
feature_mean = pd.concat([target, feature_1], axis = 1)
feature_mean.head()

Unnamed: 0,price_range,battery_power,blue,clock_speed,dual_sim,fc,four_g
0,1,-0.902372,-0.989802,0.830572,-1.018929,-0.762304,-1.043705
1,2,-0.495015,1.009798,-1.252751,0.980932,-0.992642,0.957646
2,2,-1.537302,1.009798,-1.252751,0.980932,-0.531966,0.957646
3,2,-1.418964,1.009798,1.198217,-1.018929,-0.992642,-1.043705
4,1,1.325574,1.009798,-0.394912,-1.018929,2.001753,0.957646


In [15]:
# Reshape to long format: one row per (price_range, feature, value) triple.
mean_melt = feature_mean.melt(
    id_vars = 'price_range',
    var_name = 'feature',
    value_name = 'value',
)
mean_melt.head()

Unnamed: 0,price_range,feature,value
0,1,battery_power,-0.902372
1,2,battery_power,-0.495015
2,2,battery_power,-1.537302
3,2,battery_power,-1.418964
4,1,battery_power,1.325574


In [16]:
# Show the first five rows of the raw (unstandardised) data again.
data.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [17]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    original_features,
    target,
    test_size = 0.3,
    random_state = 10,
)

# Report the shape of each piece, in the order X_train, Y_train, X_test, Y_test.
for split_name, split_part in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: ", split_part.shape)

X_train shape:  (1400, 20)
Y_train shape:  (1400,)
X_test shape:  (600, 20)
Y_test shape:  (600,)


In [18]:
# Train a random forest (seeded for repeatability) on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(random_state = 42).fit(X_train, Y_train)

# Predict labels for the held-out test rows.
Y_pred = rf.predict(X_test)

In [20]:
# Evaluate model accuracy as a percentage.
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# first, predictions second.
accuracy = accuracy_score(Y_test, Y_pred) * 100
print("Accuracy: {:.2f}%".format(accuracy))


Accuracy: 86.67%


# Univariate feature selection

In [26]:
# Re-create the train/test partition from the full feature frame, using the
# same test_size and random_state as the baseline split.
X_train, X_test, Y_train, Y_test = train_test_split(
    original_features,
    target,
    test_size = 0.3,
    random_state = 10,
)

In [27]:
# Build a chi-squared univariate selector that keeps the 5 best-scoring
# features, and fit it on the training split only.
select_features = SelectKBest(score_func = chi2, k = 5)
select_features.fit(X_train, Y_train)

# get_support() returns a boolean mask over the input columns; use it to
# look up the names of the columns that were kept.
selected_features = select_features.get_support()
kept_columns = X_train.columns[selected_features]
print("Top 5 features: ", list(kept_columns))

Top 5 features:  ['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram']


In [28]:
# Reduce the training and test sets to the columns kept by the fitted selector.
# NOTE: this rebinds X_train / X_test to the reduced arrays, so a second run of
# this cell would feed already-reduced data back into the selector. Re-run the
# train/test split cell before running this one again.
X_train = select_features.transform(X_train)
X_test = select_features.transform(X_test)

# Fit a fresh random forest on the reduced features and predict the test labels.
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

# Evaluate model accuracy as a percentage.
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# first, predictions second.
accuracy = accuracy_score(Y_test, Y_pred) * 100
print("Accuracy: {:.2f}%".format(accuracy))


Accuracy: 91.67%
