# Classificador de marcas de celular
Após tentativas de tratar a coluna MarcaCelular utilizando expressões regulares, foi decidido utilizar um classificador baseado em aprendizado de maquina para as marcas de celular.


In [None]:
import subprocess
subprocess.check_call(["pip", "install", "scikit-learn"])
subprocess.check_call(["pip", "install", "pyodbc"])
subprocess.check_call(["pip", "install", "matplotlib"])

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pyodbc
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt

## Acessando o banco e selecionando os dados

In [67]:
load_dotenv()
servidor = os.getenv("SERVIDOR")
banco = os.getenv("BANCO")
driver = os.getenv("DRIVER")

str_conexao = f"DRIVER={driver};SERVER={servidor};DATABASE={banco};Trusted_Connection=yes;"

try:
    conexao = pyodbc.connect(str_conexao)

    consulta = """
    SELECT
        [marcaCelular]
    ,   COUNT(1) AS [volume]
    FROM [PortalTransparencia].[Consolidado].[Boletins]
    WHERE [marcaCelular] IS NOT NULL
    AND [quantidadeCelular] = 1
    AND [dataOcorrencia] >= '2018-01-01'
    GROUP BY [marcaCelular]
    """

    df = pd.read_sql(consulta, conexao)

except Exception as e:
    print(f"Erro de conexão: {str(e)}")

finally:
    if conexao:
        conexao.close()


  df = pd.read_sql(consulta, conexao)


In [37]:
df.head()

Unnamed: 0,marcaCelular,volume
0,SAMSUNG J 7 METAL |,2
1,SAMSUNG J2 PRIME TV 16GB|,1
2,SANSUNG J500M DS SMART PRETO|,1
3,APPLE - IPHONE 7PLUS 256GB|,1
4,IPHONE 7 PLUS PRETO|,7


## Criando o classificador

In [56]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['marcaCelular'])

kmeans = KMeans(n_clusters=7, random_state=0)
kmeans.fit(X)


  super()._check_params_vs_input(X, default_n_init=10)


In [57]:
df['label'] = kmeans.labels_

df.head(20)

Unnamed: 0,marcaCelular,volume,label
0,SAMSUNG J 7 METAL |,2,2
1,SAMSUNG J2 PRIME TV 16GB|,1,2
2,SANSUNG J500M DS SMART PRETO|,1,6
3,APPLE - IPHONE 7PLUS 256GB|,1,4
4,IPHONE 7 PLUS PRETO|,7,4
5,SAMSUNG GALAXY A7 |,2,2
6,WAIW|,1,0
7,"ASUS, ZEN PHONE 4 SELFIE|",1,3
8,SANSUNG G532 GAL.J2 PRIME 16G DUOS TV DR 35695...,1,6
9,SAMSUNG GALAXY J5 PRIME RS G570|,1,2


In [60]:
print(X)

  (0, 2603)	1
  (0, 1896)	1
  (1, 2603)	1
  (1, 1544)	1
  (1, 2406)	1
  (1, 2919)	1
  (1, 94)	1
  (2, 2670)	1
  (2, 1565)	1
  (2, 968)	1
  (2, 2744)	1
  (2, 2398)	1
  (3, 648)	1
  (3, 1492)	1
  (3, 459)	1
  (3, 163)	1
  (4, 2398)	1
  (4, 1492)	1
  (4, 2346)	1
  (5, 2603)	1
  (5, 1254)	1
  (5, 564)	1
  (6, 2988)	1
  (7, 661)	1
  (7, 3233)	1
  :	:
  (12906, 2384)	1
  (12906, 867)	1
  (12907, 661)	1
  (12907, 3239)	1
  (12907, 982)	1
  (12907, 1333)	1
  (12907, 479)	1
  (12907, 714)	1
  (12907, 809)	1
  (12907, 276)	1
  (12907, 2865)	1
  (12907, 3196)	1
  (12908, 2046)	1
  (12909, 1254)	1
  (12909, 2829)	1
  (12910, 3046)	1
  (12910, 2487)	1
  (12911, 1254)	1
  (12911, 1580)	1
  (12912, 2919)	1
  (12912, 2758)	1
  (12912, 1762)	1
  (12912, 1648)	1
  (12912, 956)	1
  (12912, 2695)	1


In [66]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(df['marcaCelular'], df['marcaCelular'], test_size=0.2, random_state=0)

# Vetorizar os dados de texto
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Treinar o modelo Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Fazer previsões
y_pred = nb_classifier.predict(X_test_vec)

# Avaliar o desempenho do modelo
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Acurácia do modelo: {accuracy}')

Acurácia do modelo: 0.0


In [None]:
import numpy as np

