In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
)
from sklearn.model_selection import (
    ShuffleSplit,
    cross_validate,
    RandomizedSearchCV,
)
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import metrics

from nltk.tokenize import TweetTokenizer


from sklearn import svm

nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /home/marvin-
[nltk_data]     linux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Carregar o dataframe

In [2]:
df = pd.read_csv(
    "../data/raw/NoThemeTweets.csv", usecols=["tweet_text", "sentiment"]
).sample(9000, random_state=42)

# x_train, x_test, y_train, y_test = train_test_split(df.tweet_text,df.sentiment,test_size=0.2, random_state=42)

# len(x_train)


In [3]:
# corpus_train = df.tweet_text[2000:].to_list()
# labels_train = df.sentiment[2000:].replace({"Positivo": 1, "Negativo": 0}).to_list()

corpus_train = df.tweet_text.to_list()
labels_train = df.sentiment.replace({"Positivo": 1, "Negativo": 0}).to_list()

# corpus_test = x_test.to_list()
# labels_test = y_test.replace({"Positivo": 1, "Negativo": 0}).to_list()

# corpus_test = df.tweet_text[:2000].to_list()
# labels_test = df.sentiment[:2000].replace({"Positivo": 1, "Negativo": 0}).to_list()

stop_words = nltk.corpus.stopwords.words("portuguese") + ["https"] + ["co"]


In [4]:
# splited_dataset = ShuffleSplit(n_splits=10, test_size=0.2)

# splited_dataset


In [5]:
models = {
    "KNN": {
        "model_obj": KNeighborsClassifier(),
        "hyperparameters": {
            "n_neighbors": [7, 11, 21],
            "weights": ["uniform", "distance"],
        },
    },
    "SMV": {
        "model_obj": svm.SVC(),
        "hyperparameters": {
            "kernel": ["linear", "rbf"],
            "C": [0.1, 0.5, 1, 5, 10],
        },
    },
    "GaussianNB": {
        "model_obj": GaussianNB(),
        "hyperparameters": {
            "var_smoothing": [
                1e-8,
                1e-6,
                1e-4,
                1e-2,
            ]
        },
    },
}

tweet_tokenizer = TweetTokenizer()

vectorizers = {
    "TfidfVectorizer": {
        "vectorizer_obj": TfidfVectorizer(),
        "hyperparameters": {
            "max_features": [500, 1000, 2000],
            "analyzer": ["word", "char"],
            "stop_words": [stop_words, None],
            "tokenizer": [tweet_tokenizer.tokenize, None],
        },
    },
    "CountVectorizer": {
        "vectorizer_obj": CountVectorizer(),
        "hyperparameters": {
            "max_features": [500, 1000, 2000],
            "analyzer": ["word", "char"],
            "stop_words": [stop_words, None],
            "tokenizer": [tweet_tokenizer.tokenize, None],
        },
    },
}

normalizers = {
    "PCA": {
        "normalizer_obj": TruncatedSVD(),
        "hyperparameters": {
            "n_components": [10, 30, 50, 75],
        },
    }
}

scalers = {
    "Scaler": {
        "scaler_obj": StandardScaler(),
        "hyperparameters": {},
    }
}


In [6]:
n_splits_cv = 2
n_splits_gs = 2

all_scores = {}

split_cv = ShuffleSplit(n_splits=n_splits_cv, test_size=0.2)
for model_name, model_data in models.items():

    model_params = {
        f"model__{key}": value for key, value in model_data["hyperparameters"].items()
    }

    for vectorizer_name, vectorizer_data in vectorizers.items():

        vectorize_params = {
            f"vectorizer__{key}": value
            for key, value in vectorizer_data["hyperparameters"].items()
        }

        for normalizer_name, normalizer_data in normalizers.items():

            normalizer_params = {
                f"normalizer__{key}": value
                for key, value in normalizer_data["hyperparameters"].items()
            }

            for scaler_name, scaler_data in scalers.items():

                scaler_params = {
                    f"scaler__{key}": value
                    for key, value in scaler_data["hyperparameters"].items()
                }

                param_distributions = {
                    **model_params,
                    **vectorize_params,
                    **normalizer_params,
                    **scaler_params,
                }

                pipeline = Pipeline(
                    steps=[
                        ("vectorizer", vectorizer_data["vectorizer_obj"]),
                        ("normalizer", normalizer_data["normalizer_obj"]),
                        ("scaler", scaler_data["scaler_obj"]),
                        ("model", model_data["model_obj"]),
                    ]
                )

                approach_name = (
                    f"{model_name}__{vectorizer_name}__{normalizer_name}__{scaler_name}"
                )

                print(f"Fiting best model to \n{approach_name}", end="\n\n")

                tuned_pipeline = RandomizedSearchCV(
                    pipeline,
                    param_distributions,
                    scoring="f1",
                    cv=n_splits_gs,
                )

                scores = cross_validate(
                    tuned_pipeline,
                    corpus_train,
                    labels_train,
                    cv=split_cv,
                    scoring=["accuracy", "f1", "recall"],
                )

                all_scores.update(
                    {
                        approach_name: {
                            "scores": scores,
                            "tuned_pipeline": tuned_pipeline,
                        }
                    }
                )


Fiting best model to 
KNN__TfidfVectorizer__PCA__Scaler

Fiting best model to 
KNN__CountVectorizer__PCA__Scaler

Fiting best model to 
SMV__TfidfVectorizer__PCA__Scaler

Fiting best model to 
SMV__CountVectorizer__PCA__Scaler

Fiting best model to 
GaussianNB__TfidfVectorizer__PCA__Scaler

Fiting best model to 
GaussianNB__CountVectorizer__PCA__Scaler



In [7]:
all_scores


{'KNN__TfidfVectorizer__PCA__Scaler': {'scores': {'fit_time': array([14.8744247 , 10.98238444]),
   'score_time': array([0.28571343, 0.21477246]),
   'test_accuracy': array([0.98      , 0.97833333]),
   'test_f1': array([0.96721311, 0.96719933]),
   'test_recall': array([0.95848375, 0.95993322])},
  'tuned_pipeline': RandomizedSearchCV(cv=2,
                     estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                               ('normalizer', TruncatedSVD()),
                                               ('scaler', StandardScaler()),
                                               ('model',
                                                KNeighborsClassifier())]),
                     param_distributions={'model__n_neighbors': [7, 11, 21],
                                          'model__weights': ['uniform',
                                                             'distance'],
                                          'normalizer__n_compon

In [8]:
all_scores

corpus_name = "NoThemeTweets"
approach_names = []
fit_times = []
scores_times = []
accuracy_means = []
f1_scores_mean = []
recall_scores_mean = []


for approach_name, score in all_scores.items():
    print(f"{approach_name}")
    print(f"{score['scores']}")
    approach_names.append(approach_name)
    fit_times.append(score["scores"]["fit_time"].mean())
    scores_times.append(score["scores"]["score_time"].mean())
    accuracy_means.append(score["scores"]["test_accuracy"].mean())
    f1_scores_mean.append(score["scores"]["test_f1"].mean())
    recall_scores_mean.append(score["scores"]["test_recall"].mean())
    print("\n")


# data={'Name':['Karan','Rohit','Sahil','Aryan'],'Age':[23,22,21,24]}


test_data = data = {
    "approach": approach_names,
    "fit_time": fit_times,
    "score_time": scores_times,
    "accuracy": accuracy_means,
    "f1": f1_scores_mean,
    "recall": recall_scores_mean,
}


test_data_df = pd.DataFrame(test_data)


test_data_df


KNN__TfidfVectorizer__PCA__Scaler
{'fit_time': array([14.8744247 , 10.98238444]), 'score_time': array([0.28571343, 0.21477246]), 'test_accuracy': array([0.98      , 0.97833333]), 'test_f1': array([0.96721311, 0.96719933]), 'test_recall': array([0.95848375, 0.95993322])}


KNN__CountVectorizer__PCA__Scaler
{'fit_time': array([15.63183713, 13.732867  ]), 'score_time': array([0.18967867, 0.22653437]), 'test_accuracy': array([0.97833333, 0.985     ]), 'test_f1': array([0.96758105, 0.97813765]), 'test_recall': array([0.97487437, 0.97734628])}


SMV__TfidfVectorizer__PCA__Scaler
{'fit_time': array([11.36878777, 23.89521456]), 'score_time': array([0.0521903 , 0.05033612]), 'test_accuracy': array([0.99777778, 0.99722222]), 'test_f1': array([0.99669967, 0.9958368 ]), 'test_recall': array([0.99505766, 0.99500832])}


SMV__CountVectorizer__PCA__Scaler
{'fit_time': array([42.36956191, 41.74280381]), 'score_time': array([0.19951916, 0.04632664]), 'test_accuracy': array([0.98777778, 0.99944444]), 't

Unnamed: 0,approach,fit_time,score_time,accuracy,f1,recall
0,KNN__TfidfVectorizer__PCA__Scaler,12.928405,0.250243,0.979167,0.967206,0.959208
1,KNN__CountVectorizer__PCA__Scaler,14.682352,0.208107,0.981667,0.972859,0.97611
2,SMV__TfidfVectorizer__PCA__Scaler,17.632001,0.051263,0.9975,0.996268,0.995033
3,SMV__CountVectorizer__PCA__Scaler,42.056183,0.122923,0.993611,0.989643,0.991905
4,GaussianNB__TfidfVectorizer__PCA__Scaler,6.9475,0.151297,0.933056,0.896231,0.886869
5,GaussianNB__CountVectorizer__PCA__Scaler,5.323689,0.149068,0.925833,0.889889,0.941506


In [9]:
x_train, x_test, y_train, y_test = train_test_split(df.tweet_text,df.sentiment,test_size=0.2, random_state=42)

corpus_train2 = x_train.to_list()
labels_train2 = y_train.replace({"Positivo": 1, "Negativo": 0}).to_list()

corpus_test2 = x_test.to_list()
labels_test2 = y_test.replace({"Positivo": 1, "Negativo": 0}).to_list()


all_scores["SMV__TfidfVectorizer__PCA__Scaler"]["tuned_pipeline"].fit(corpus_train2, labels_train2)

print("______" * 30)
print(all_scores["SMV__TfidfVectorizer__PCA__Scaler"]["tuned_pipeline"].best_params_)

In [None]:
y_hat = all_scores["SMV__TfidfVectorizer__PCA__Scaler"]["tuned_pipeline"].predict(corpus_test2)

y_hat

In [None]:
accuracy_score(labels_test2, y_hat)

In [None]:
f1_score(labels_test2, y_hat)

In [None]:
a = all_scores[" KNN + TfidfVectorizer + PCA + Scaler"]["pipeline"].best_params_
a


In [None]:
n_splits_cv = 2
n_splits_gs = 2

all_scores = {}

split_cv = ShuffleSplit(n_splits=n_splits_cv, test_size=0.2)

for model_name, model_data in models.items():

    model_params = {
        f"model__{key}": value for key, value in model_data["hyperparameters"].items()
    }
    print(model_params)

    param_distributions = {
        "vectorizer__tfidf__use_idf": [False, True],
        "vectorizer__count__max_features": [1000, 2000],
        "pca__n_components": [100, 200, 500],
        **model_params,
    }

    print(param_distributions)

    pipeline = Pipeline(
        steps=[
            ("vectorizer", vectorizer),
            ("pca", pca),
            ("normalize", scaler),
            ("model", model_data["model_obj"]),
        ]
    )

    gs_model = RandomizedSearchCV(
        pipeline,
        param_distributions,
        scoring="f1",
        cv=n_splits_gs,
    )

    scores = cross_validate(
        gs_model, corpus, labels, cv=split_cv, scoring=["accuracy", "f1"]
    )

    all_scores.update({model_name: scores})

scores


In [None]:
all_scores



In [None]:
param_distributions = {
    "vectorizer__n_components": [100, 200, 500, 1000],
    **model_data["hyperparameters"],
}

param_distributions


In [None]:
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()

pipeline = Pipeline(
    steps=[
        (
            "vectorizer",
            CountVectorizer(analyzer="word", stop_words=stop_words, max_features=1000),
        ),
        ("pca", TruncatedSVD(500)),
        ("clf", KNeighborsClassifier(n_neighbors=11)),
    ]
)

pipeline2 = Pipeline(
    steps=[
        (
            "vectorizer",
            CountVectorizer(
                analyzer="word",
                stop_words=stop_words,
                max_features=1000,
                tokenizer=tweet_tokenizer.tokenize,
            ),
        ),
        ("pca", TruncatedSVD(500)),
        ("clf", KNeighborsClassifier(n_neighbors=11)),
    ]
)


pipeline3 = Pipeline(
    steps=[
        (
            "vectorizer",
            CountVectorizer(
                analyzer="word",
                stop_words=stop_words,
                max_features=1000,
                tokenizer=tweet_tokenizer.tokenize,
            ),
        ),
        ("clf", MultinomialNB()),
    ]
)


pipeline4 = Pipeline(
    steps=[
        (
            "vectorizer",
            CountVectorizer(
                analyzer="word",
                stop_words=stop_words,
                max_features=1000,
                tokenizer=tweet_tokenizer.tokenize,
            ),
        ),
        ("clf", svm.SVC(kernel="linear")),
    ]
)


In [None]:
pipeline.fit(corpus_train, labels_train)
pipeline2.fit(corpus_train, labels_train)
pipeline3.fit(corpus_train, labels_train)
pipeline4.fit(corpus_train, labels_train)


In [None]:
# pipeline3.fit(corpus_train, labels_train)
# y_hat3 = pipeline3.predict(corpus_test)
# y_hat3


In [None]:
y_hat = pipeline.predict(corpus_test)
y_hat2 = pipeline2.predict(corpus_test)
y_hat3 = pipeline3.predict(corpus_test)
y_hat4 = pipeline4.predict(corpus_test)


In [None]:
accuracy_score(labels_test, y_hat)


In [None]:
accuracy_score(labels_test, y_hat2)


In [None]:
accuracy_score(labels_test, y_hat3)


In [None]:
accuracy_score(labels_test, y_hat4)


In [None]:
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()


cv = CountVectorizer(
    analyzer="word",
    stop_words=stop_words,
    max_features=1500,
    tokenizer=tweet_tokenizer.tokenize,
)

freq_train = cv.fit_transform(corpus_train)


freq_train


In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(freq_train, labels_train)

model


freq_train


In [None]:
teste = [
    "Não é um bom dia para começar a programar",
    "Estou ficando muito cansado",
    "Não consigo fazer nada",
    "Que dia lindo, hoje é dia de programar",
    "É coisa, é coisa pura, é coisa maravilhosa",
    "Bolsonaro é um cara horrível",
    "Estou querendo morrer",
    "Abençoa senhor a minha vida bolsonaro",
]


In [None]:
resultados = cross_val_predict(gs_model, corpus, labels, cv=10)


In [None]:
resultados2 = cross_val_score(pipeline4, corpus_train, labels_train, cv=10)


In [None]:
metrics.accuracy_score(labels_train, resultados)


In [None]:
resultados3 = cross_validate(pipeline4, corpus_train, labels_train, cv=10)


In [None]:
resultados3
