In [1]:
import pandas as pd
import os

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

---

# Load data

In [5]:
pwd

'/Users/braulio/Documents/github/FAKE'

In [6]:
CORPUS = []
LABEL = []

In [7]:
# Step 1: get all the fake news text files into a list named corpus, give them label 1 -- fake
dir1 = 'Data/size_normalized_texts/fake/'
list = os.listdir(dir1)
number_files = len(list)

In [8]:
for i in range(number_files):
        title = list[i]
        LABEL.append(1)
        with open(dir1 + title,'r') as reader:

            doc = reader.read()
            doc.lower()
            doc.split()
            reader.close
            CORPUS.append(doc)

In [9]:
assert len(CORPUS) == len(LABEL)

In [10]:
len(CORPUS)

3600

In [11]:
# Step 2: repeat the procedure with true news, give them label 0 -- true

In [12]:
dir2 = 'Data/size_normalized_texts/true/'
list = os.listdir(dir2)
number_files = len(list)

In [13]:
for i in range(number_files):
        title = list[i]
        LABEL.append(0)
        with open(dir2 + title,'r') as reader:

            doc = reader.read()
            doc.lower()
            doc.split()
            reader.close
            CORPUS.append(doc)

In [14]:
assert len(CORPUS) == len(LABEL)

# Vectorize words

customize my stop word list based on the context of the NLP application that you are building.

ex.: portuguse_stop_words = ["a", "o","um","uma","pelo", "pela","ou",...]

In [15]:
def vectors_and_df(corpus, label):
    """creates vectors for songs and returns dataframe with songs as word vectors 
    by all artists"""
    
    cv = TfidfVectorizer()
    cv.fit(corpus)
    corpus_vecs = cv.transform(corpus)
    
    return pd.DataFrame(corpus_vecs.todense(), index=label, 
                        columns=cv.get_feature_names()), cv

In [16]:
# Store results into dataframe, keep cv for later prediction
df, cv = vectors_and_df(CORPUS, LABEL)

# Split and train data

In [17]:
# Define features and target column
X = df
y = df.index

In [18]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [19]:
MODELS = {
    "MultinomialNB": {"alpha": 0.005},
    "CategoricalNB": {"alpha": 0.01},
    "RandomForestClassifier": {
        "n_estimators": 500,
        "max_depth": 200,
        "max_features": "auto",
        "n_jobs": -1,
        "random_state": 1,
    },
    "LogisticRegression": {"C": 1e6},
}

def train_models(models_params):
    """trains models on corpus and returns dataframe with scores"""
    
    scores = {}
    for model in models_params:
        if model == "LogisticRegression":
            m = LogisticRegression(**models_params[model])
        elif model == "RandomForestClassifier":
            m = RandomForestClassifier(**models_params[model])
        elif model == "MultinomialNB":
            m = MultinomialNB(**models_params[model])
        elif model == "CategoricalNB":
            m = MultinomialNB(**models_params[model])

        m.fit(Xtrain, ytrain)
        score_train = m.score(Xtrain, ytrain)
        score_test = m.score(Xtest, ytest)
        scores[f"{model}"] = {
            "params": models_params[model],
            "train score": score_train,
            "test score": score_test,
            }
    return pd.DataFrame(scores).T

In [20]:
train_models(MODELS)

Unnamed: 0,params,train score,test score
MultinomialNB,{'alpha': 0.005},0.98941,0.839583
CategoricalNB,{'alpha': 0.01},0.987674,0.841667
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",1.0,0.879167
LogisticRegression,{'C': 1000000.0},1.0,0.905556


In [21]:
# Train on most promising model
model = "LogisticRegression"
m = LogisticRegression(**MODELS[model])
m.fit(X, y)
m.score(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1.0

In [22]:
def predict(new_text):
    
    """
    Takes the pre-trained model pipeline and predicts new artist based on unseen text.
    
    Parameters
    ----------
    model : Trained scikit-learn model pipeline.
    new_text : str
    
    Returns
    ---------
    prediction : str
    
    """
    article = [new_text]
    # transform song into vector matrix
    new_article_vecs = cv.transform(article)
    ynew = new_article_vecs.todense()
    
    prediction = m.predict(ynew)
    
    return prediction[0]

In [23]:
if __name__ == '__main__':
    # Whatever happens after this line, execute it when running "python lyrics_classifier.py"
    # and DO NOT execute these lines of code if things from this script are imported from other scripts.
    
    user_input = input('Please Enter Some Text: ')

    prediction = predict(user_input)
    print('Here is your prediction!')
    print(prediction)

Please Enter Some Text: Mensagem de Robert F. Kennedy, Jr. "Para todos os meus pacientes: Gostaria de chamar sua atenção com urgência para questões importantes relacionadas à próxima vacinação contra Covid-19. Pela primeira vez na história da vacinação, as chamadas vacinas de mRNA de última geração intervêm diretamente no material genético do paciente e, portanto, alteram o material genético individual, que representa a manipulação genética, algo que já foi proibido e até então considerado criminoso. Essa intervenção pode ser comparada à de alimentos geneticamente manipulados, que também é altamente controversa. Mesmo que a mídia e os políticos atualmente banalizem o problema e até mesmo clamem estupidamente por um novo tipo de vacina para voltar à normalidade, essa vacinação é problemática em termos de saúde, moral e ética, e também em termos de danos genéticos que, ao contrário dos danos causados pelas vacinas anteriores, serão irreversíveis e irreparáveis. Caros pacientes, após uma 