In [182]:
import pandas as pd
from sklearn.feature_selection import chi2
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [183]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [184]:
# Getting the shape of train and test data
train_data.shape
test_data.shape

(1000, 20)

In [185]:
test_data.shape

(1000, 20)

In [186]:
# Checking the class distribution
train_data.price_range.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [187]:
# Finding the missing values
train_data.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [188]:
# Dimensionaltiy reduction


# Encode categorical columns if necessary
X = train_data.drop('price_range', axis=1)
y = train_data['price_range']
chi_scores, p_values = chi2(X, y)
feature_importance = pd.Series(chi_scores, index=X.columns)

chi2_results = pd.DataFrame({
    'Feature': X.columns,
    'Chi2_Score': chi_scores,
    'P_Value': p_values
}).sort_values(by='Chi2_Score', ascending=False)



chi2_threshold = 0.05  # Minimum chi-squared score
p_value_threshold = 0.05  # Maximum acceptable p-value
important_features = chi2_results[
    (chi2_results['Chi2_Score'] > chi2_threshold) &
    (chi2_results['P_Value'] < p_value_threshold)
]['Feature']


X = X[important_features]

In [189]:
# Breaking the training data into train and test for model development

In [190]:
# # Detecting and removing outliers using Z Score


# # Apply Z-Score method
# threshold = 3
# z_scores = X.apply(zscore)

# # Identify outliers
# outliers_z = (z_scores.abs() > threshold)
# outliers_df = X[outliers_z.any(axis=1)]  # Rows where any column has an outlier

# X = X[(z_scores.abs() <= threshold).all(axis=1)]

In [191]:
X.shape

(2000, 12)

In [192]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

In [193]:
# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of training label:', y_test.shape)

Shape of training feature: (1600, 12)
Shape of testing feature: (400, 12)
Shape of training label: (1600,)
Shape of training label: (400,)


In [196]:
def evaluate_model(model, x_test, y_test):
    from sklearn import metrics

    # Predict Test Data 
    y_pred = model.predict(x_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred, average='macro')
    rec = metrics.recall_score(y_test, y_pred, average='macro')
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # # Calculate area under curve (AUC)
    # y_pred_proba = model.predict_proba(x_test)[::,1]
    # fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    # auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Display confussion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa, 'cm': cm}

In [200]:
# Building Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train,y_train)
    evaluation = evaluate_model(model, X_test, y_test)
    print(f'For Model : {name}')
    print(evaluation)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


For Model : Logistic Regression
{'acc': 0.635, 'prec': 0.6477057747291239, 'rec': 0.6355145516058992, 'f1': 0.6376238603761414, 'kappa': np.float64(0.5132562655753827), 'cm': array([[69, 23,  1,  0],
       [13, 52, 34,  8],
       [ 0, 14, 52, 24],
       [ 0,  0, 29, 81]])}
For Model : Random Forest
{'acc': 0.9025, 'prec': 0.9018153236573496, 'rec': 0.9023456604026396, 'f1': 0.9018665826943368, 'kappa': np.float64(0.8697786236602224), 'cm': array([[90,  3,  0,  0],
       [ 3, 96,  8,  0],
       [ 0,  6, 76,  8],
       [ 0,  0, 11, 99]])}
For Model : SVM
{'acc': 0.9575, 'prec': 0.9579199735449736, 'rec': 0.956147851746284, 'f1': 0.9566707387878197, 'kappa': np.float64(0.9431609214617673), 'cm': array([[ 92,   1,   0,   0],
       [  4, 103,   0,   0],
       [  0,   4,  81,   5],
       [  0,   0,   3, 107]])}
For Model : Naive Bayes
{'acc': 0.8475, 'prec': 0.852814964590375, 'rec': 0.8499384103318381, 'f1': 0.8485587547140727, 'kappa': np.float64(0.7968072083475596), 'cm': array([

In [210]:
# From above we can see that SVM is the best performing model
svm = SVC()
svm.fit(X_train, y_train)

# For SVM (support vector machines), we can access decision function values
decision_values = svm.decision_function(X_test)

# Print decision function values for first 10 samples
print("Decision function values:", abs(decision_values[:10]))


Decision function values: [[3.23524796 2.27228418 0.86710269 0.2870077 ]
 [2.18906505 3.28546928 1.20821515 0.29979483]
 [3.26965112 2.28686089 0.84896711 0.30182303]
 [0.74436303 3.274159   2.25418042 0.27331789]
 [0.29650972 0.74481727 2.26564386 3.29361762]
 [0.28397133 2.22212174 3.27898903 0.80682942]
 [2.21273226 3.28737816 1.17309134 0.30032207]
 [0.29582609 0.76243667 2.27056565 3.28609988]
 [0.30128365 0.74091581 2.27402368 3.29737384]
 [1.88882013 3.27910568 1.1915854  0.28570104]]


In [211]:
y_test

1967    1
1332    1
1263    0
516     1
1491    3
       ..
1511    1
1821    3
1476    2
367     3
1372    2
Name: price_range, Length: 400, dtype: int64

In [214]:
# Saving the model
import joblib
joblib.dump(model, 'svm_mobile_classification.joblib')

['svm_mobile_classification.joblib']

In [215]:
# Loading the model
import joblib

# Load the saved model from the file
model = joblib.load('svm_mobile_classification.joblib')

print("Model loaded successfully!")

Model loaded successfully!
