In [None]:
import pandas as pd
import numpy as np
import random
import time

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Análise Exploratória

In [None]:
# Load the smartphone dataset from its CSV file and preview the first rows.
DATASET_PATH = '/content/drive/MyDrive/tcc/data_smartphones.csv'
smartphones_df = pd.read_csv(DATASET_PATH)
smartphones_df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


# Pré-processamento

In [None]:
def get_max_fence(column):
    """Return the upper outlier limit of a series from its interquartile range.

    The limit is ``Q3 + 1.5 * IQR`` with ``IQR = Q3 - Q1``. A value greater
    than this limit is considered an outlier.

    Parameters
    ----------
    column : pd.Series
        Data from which the quartiles are computed.

    Returns
    -------
    float
        Upper outlier limit for ``column``.
    """
    # One quantile call yields both quartiles, in the order requested.
    lower_q, upper_q = column.quantile([0.25, 0.75]).to_numpy()
    return upper_q + 1.5 * (upper_q - lower_q)

# Upper outlier limits for the two columns that are screened for outliers.
fc_fence, px_height_fence = (
    get_max_fence(smartphones_df[feature]) for feature in ('fc', 'px_height')
)

print(f'Fc max fence: {fc_fence}, Px max fence: {px_height_fence}')

Fc max fence: 16.0, Px max fence: 1944.0


In [None]:
# Flag every row whose fc or px_height reaches its fence.
# NOTE(review): the comparison is inclusive (>=), so rows sitting exactly on a
# fence are removed too - confirm this is intended.
is_outlier = (
    (smartphones_df['fc'] >= fc_fence)
    | (smartphones_df['px_height'] >= px_height_fence)
)
clean_df = smartphones_df.drop(index=smartphones_df.index[is_outlier])
clean_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [None]:
# Columns left out of the modelling table.
dropped_features = ['fc', 'px_width', 'sc_h']
model_df = clean_df.drop(dropped_features, axis=1)
model_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,ram,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,0,7,0.6,188,2,2,20,2549,7,19,0,0,1,1
1,1021,1,0.5,1,1,53,0.7,136,3,6,905,2631,3,7,1,1,0,2
2,563,1,0.5,1,1,41,0.9,145,5,6,1263,2603,2,9,1,1,0,2
3,615,1,2.5,0,0,10,0.8,131,6,9,1216,2769,8,11,1,0,0,2
4,1821,1,1.2,0,1,44,0.6,141,2,14,1208,1411,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,1,2,0.8,106,6,14,1222,668,4,19,1,1,0,0
1996,1965,1,2.6,1,0,39,0.2,187,4,3,915,2032,10,16,1,1,1,2
1997,1911,0,0.9,1,1,36,0.7,108,8,3,868,3057,1,5,1,1,0,3
1998,1512,0,0.9,0,1,46,0.1,145,5,5,336,869,10,19,1,1,1,0


In [None]:
# Number of rows per target class in the modelling table.
model_df['price_range'].value_counts()

0    491
3    490
2    489
1    486
Name: price_range, dtype: int64

# Modelagem

In [None]:
# Draw 30 distinct integers from 0-99. No seed is set before this call, so
# each run of the cell prints a different list.
random.sample(range(100), 30)

[19,
 42,
 41,
 91,
 38,
 27,
 29,
 59,
 0,
 79,
 97,
 22,
 80,
 14,
 93,
 95,
 77,
 25,
 70,
 53,
 33,
 39,
 2,
 3,
 56,
 40,
 43,
 75,
 52,
 9]

In [None]:
# One random_state value per train/test split of the experiment loop (30 values).
seeds = [19, 42, 41, 91, 38, 27, 29, 59, 0, 79, 97, 22, 80, 14, 93, 
         95, 77, 25, 70, 53, 33, 39,  2, 3, 56, 40, 43, 75, 52, 9]

In [None]:
# Separate the target column from the feature matrix.
y = model_df['price_range']
X = model_df.drop(columns=['price_range'])

# Hold out 30% of the rows for testing; the remaining 70% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=33
)

In [None]:
# Benchmark: fit and evaluate every classifier on one 70/30 split per seed.

# Parallel accumulators - one entry per (split, classifier) pair.
splits = []
names = []
accs = []
f1s = []
prs = []
rcs = []
times = []

# (label, estimator) pairs compared in the experiment.
classifiers = [
    ('kNN', KNeighborsClassifier(n_neighbors=4)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('NN', MLPClassifier(random_state=0, solver='lbfgs')),
    ('LR', LogisticRegression()),
    ('RF', RandomForestClassifier(random_state=0))
]

# X stays an unscaled DataFrame; scaling happens per split inside the loop.
X = model_df.drop(columns=['price_range'])
y = model_df['price_range']

for split, seed in enumerate(seeds):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=seed, shuffle=True
    )

    # Fit the scaler on the training rows only and reuse its statistics for
    # the test rows. Scaling the full dataset before splitting would leak
    # test-set mean/variance into the training data.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    for name, clf in classifiers:
        # perf_counter is a monotonic, high-resolution clock, which matters
        # for fits that take well under a millisecond.
        start = time.perf_counter()
        clf.fit(X_train, y_train)
        training_time = time.perf_counter() - start

        preds = clf.predict(X_test)

        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds, average='weighted')
        precision = precision_score(y_test, preds, average='weighted')
        recall = recall_score(y_test, preds, average='weighted')

        splits.append(split)
        names.append(name)
        accs.append(acc)
        f1s.append(f1)
        prs.append(precision)
        rcs.append(recall)
        times.append(training_time)

        print(split, name, acc, f1, precision, recall, training_time)

# Build the results table in a single call; column order matches the keys.
results = pd.DataFrame({
    'split': splits,
    'classifier': names,
    'accuracy': accs,
    'f1': f1s,
    'precision': prs,
    'recall': rcs,
    'training_time': times,
})

0 kNN 0.4684838160136286 0.4687202870602748 0.4794700463574454 0.4684838160136286 0.0004611015319824219
0 SVM 0.8466780238500852 0.8472761011175685 0.8507960163519351 0.8466780238500852 0.1309807300567627
0 NB 0.7836456558773425 0.7857103200031773 0.7889967660799878 0.7836456558773425 0.003443002700805664
0 NN 0.8671209540034072 0.8668926710810794 0.8669289680588744 0.8671209540034072 0.3657186031341553
0 LR 0.8943781942078365 0.8945523724998318 0.8948619934040708 0.8943781942078365 0.04786276817321777
0 RF 0.838160136286201 0.8395415704825141 0.8416335135245062 0.838160136286201 0.33670592308044434
1 kNN 0.48211243611584326 0.47862444026075573 0.4953649021150077 0.48211243611584326 0.00044608116149902344
1 SVM 0.8500851788756388 0.8502312413569134 0.8505012919475397 0.8500851788756388 0.12523722648620605
1 NB 0.787052810902896 0.7869839178229975 0.7869631069553032 0.787052810902896 0.004472494125366211
1 NN 0.8722316865417377 0.872130988673581 0.8721801504819839 0.8722316865417377 0.2

In [None]:
# Display the collected metrics, one row per (split, classifier) pair.
results

Unnamed: 0,split,classifier,accuracy,f1,precision,recall,training_time
0,0,kNN,0.468484,0.468720,0.479470,0.468484,0.000461
1,0,SVM,0.846678,0.847276,0.850796,0.846678,0.130981
2,0,NB,0.783646,0.785710,0.788997,0.783646,0.003443
3,0,NN,0.867121,0.866893,0.866929,0.867121,0.365719
4,0,LR,0.894378,0.894552,0.894862,0.894378,0.047863
...,...,...,...,...,...,...,...
175,29,SVM,0.836457,0.836481,0.836729,0.836457,0.125781
176,29,NB,0.800681,0.801484,0.802506,0.800681,0.004636
177,29,NN,0.855196,0.854912,0.855084,0.855196,0.267754
178,29,LR,0.889267,0.889065,0.888936,0.889267,0.048502


In [None]:
# Save the metrics table as CSV; index=False leaves the row index out of the file.
results.to_csv('/content/drive/MyDrive/tcc/results.csv', index=False)