In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib as plt

# Load the dataset
train_data = pd.read_csv('../dataset/cleaned_train.csv')
train_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,touch_screen,wifi,price_range,screen_area,log_ram,log_battery_power,log_int_memory,battery_power_bins,ram_bins,int_memory_bins
0,-0.902597,0,0.830779,0,-0.764629,0.0,-1.382405,0.339276,1.350676,-1.101463,...,0,1,1,63.0,7.843849,6.736967,2.079442,0,2,0
1,-0.495139,1,-1.253064,1,-0.995615,1.0,1.156334,0.686381,-0.120727,-0.664034,...,1,0,2,51.0,7.875499,6.929517,3.988984,1,2,3
2,-1.537686,1,-1.253064,1,-0.533642,1.0,0.494054,1.380591,0.133939,0.210825,...,1,0,2,22.0,7.864804,6.335054,3.73767,0,2,2
3,-1.419319,1,1.198517,0,-0.995615,0.0,-1.216835,1.033486,-0.262208,0.648255,...,0,0,2,128.0,7.926603,6.423247,2.397895,0,2,0
4,1.325906,1,-0.395011,0,2.007209,1.0,0.659624,0.339276,0.020754,-1.101463,...,1,0,1,16.0,7.252762,7.50769,3.806662,3,1,2


In [2]:
nan_values = train_data.isna().sum()

# Print columns with NaN values and their counts
print("Columns with NaN values:\n", nan_values[nan_values > 0])

Columns with NaN values:
 Series([], dtype: int64)


In [3]:
from sklearn.model_selection import train_test_split

X = train_data.drop(['price_range'], axis=1)
y = train_data['price_range']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Let's start with simple algorithms like Logistic Regression, Decision Trees, and then move to more complex ones like Random Forest, 

In [4]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()

log_clf.fit(X_train, y_train)

print(
    "Accuracy of Logistic regression classifier on train set:",
    log_clf.score(X_train, y_train),
)
print(
    "Accuracy of Logistic regression classifier on test set:",
    log_clf.score(X_val, y_val),
)

Accuracy of Logistic regression classifier on train set: 0.908125
Accuracy of Logistic regression classifier on test set: 0.895


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Split the data
X = train_data.drop(['price_range'], axis=1)
y = train_data['price_range']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[100   5   0   0]
 [  5  81   5   0]
 [  0  11  74   7]
 [  0   0  13  99]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       105
           1       0.84      0.89      0.86        91
           2       0.80      0.80      0.80        92
           3       0.93      0.88      0.91       112

    accuracy                           0.89       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.89      0.89      0.89       400



In [6]:
from sklearn.svm import SVC

C = 1.0
np.random.seed(42)

clfc = SVC(C=C, kernel="linear", gamma=2.5).fit(X_train, y_train)
print(
    "Accuracy of RBF-kernel SVC on training set: {:.2f}".format(
        clfc.score(X_train, y_train)
    )
)
print("Accuracy of RBF-kernel SVC on test set: {:.2f}".format(clfc.score(X_val, y_val)))
clfc

Accuracy of RBF-kernel SVC on training set: 0.97
Accuracy of RBF-kernel SVC on test set: 0.96


## Model Optimization

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define SVM parameters
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [1, 2.5, 2, 0.001],  # Kernel coefficient
    'kernel': ['rbf', 'linear']  # Radial Basis Function kernel
}


# Define GridSearchCV
svc = SVC()
clf = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

# Perform hyperparameter search
search = clf.fit(X_train, y_train)

# Print best parameters and elapsed time
print(f"Best Parameters: {search.best_params_}")


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'linear'}


In [8]:
clf.best_score_

0.96125

In [9]:
y_preds = clf.predict(X_val)
print(confusion_matrix(y_val, y_preds))
print(classification_report(y_val, y_preds))

[[102   3   0   0]
 [  0  91   0   0]
 [  0   1  89   2]
 [  0   0   1 111]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       105
           1       0.96      1.00      0.98        91
           2       0.99      0.97      0.98        92
           3       0.98      0.99      0.99       112

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



In [10]:
import joblib

# Save the best model
joblib.dump(clf, '../model/SVC_model.pkl')

['../model/SVC_model.pkl']