In [None]:
%run "Funciones.py"

## Probando con Naive Bayes, Árboles y Regresión Logística
---

In [None]:
# Load the two source CSV files into separate DataFrames:
# `true` from Data/True.csv and `false` from Data/Fake.csv (paths relative to the notebook's working directory).
true = pd.read_csv("Data/True.csv")
false = pd.read_csv("Data/Fake.csv")

In [None]:
# Add the binary target column: 1 for every row loaded from True.csv,
# 0 for every row loaded from Fake.csv.
true['category'] = 1
false['category'] = 0

In [None]:
# Stack both labelled frames into a single dataset.
# ignore_index=True rebuilds a unique 0..n-1 row index; without it each source
# frame keeps its own row labels, so the combined frame would carry duplicated
# labels and any label-based lookup (e.g. df.loc[...]) would be ambiguous.
df = pd.concat([true, false], ignore_index=True)

In [None]:
# Merge the two text columns into one: article text, a single space, then the title.
df['text'] = df['text'] + " " + df['title']
# Keep only the merged text and the label; the remaining metadata columns are discarded.
df = df.drop(columns=['title', 'subject', 'date'])

---

### Realizamos una limpieza básica para luego probar con diferentes modelos

In [None]:
# Basic cleaning applied to every document via denoise_text, which is defined
# outside this notebook view (presumably in Funciones.py — confirm).
# Per the original author's note it performs: lower-casing, HTML parsing,
# removal of square brackets, removal of URLs and removal of stopwords.
df['text'] = df['text'].apply(denoise_text)

---

### Separación de sets para entrenamiento y testeo

In [None]:
# Stratified split of the cleaned texts and their labels, so both partitions keep
# the same class proportions. The seed is fixed for repeatability; the test size
# is left at train_test_split's default.
train, test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    stratify=df['category'],
    random_state=0,
)

In [None]:
# Shape (row count) of the training texts.
train.shape

In [None]:
# Shape (row count) of the test texts.
test.shape

In [None]:
# Class balance of the training labels (1 = True.csv rows, 0 = Fake.csv rows).
y_train.value_counts()

In [None]:
# Class balance of the test labels (1 = True.csv rows, 0 = Fake.csv rows).
y_test.value_counts()

---

In [None]:
# English stop-word list handed to the vectorizers in the model cells.
# `stopwords` comes from outside this notebook view (presumably NLTK's corpus
# module, loaded via Funciones.py — confirm).
stop_words = stopwords.words('english')

In [None]:
# Empty DataFrame that accumulates the results: each model cell passes it to
# ejecutarModelo1 / ejecutarModelo2 / agregarResultado and rebinds `resultados`
# to the returned value. The final plot reads its "Modelo", "Accuracy" and "Set" columns.
resultados = pd.DataFrame()

---

In [None]:
# Shared hyper-parameter search reused by the MultinomialNB model cells (Models 1-4).
# 3-fold stratified cross-validation, shuffled with a fixed seed so the fold
# assignment is repeatable.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
# Candidate values for MultinomialNB's `alpha`: 0.05, 0.10, ..., 0.95 (19 points).
# np.linspace states the end value and the number of points explicitly; np.arange
# with a floating-point step can gain or lose the last element through rounding.
param_grid = {'alpha': np.linspace(0.05, 0.95, 19)}
grid = GridSearchCV(MultinomialNB(), param_grid, cv=skf, verbose=0)

### Modelo 1: CountVectorizer + MultinomialNB

In [None]:
# Model 1: raw token counts fed to the shared MultinomialNB grid search.
# `vectorizer` stays a notebook-level name because later cells reuse it
# (Model 2 and the tree-based models).
vectorizer = CountVectorizer(strip_accents='unicode', stop_words=stop_words)

# ejecutarModelo1 is defined outside this view (presumably Funciones.py);
# its return value replaces the accumulated results table.
resultados = ejecutarModelo1(
    'M1: CountVectorizer + MultinomialNB',
    grid, vectorizer,
    train, test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

---

### Modelo 2: TfidfTransformer + MultinomialNB

In [None]:
# Model 2: the same CountVectorizer instance as Model 1, with a TfidfTransformer
# passed as the extra trailing argument of ejecutarModelo1 (helper defined
# outside this view; how it applies the transformer should be confirmed there).
resultados = ejecutarModelo1(
    'M2: TfidfTransformer + MultinomialNB',
    grid, vectorizer,
    train, test, y_train, y_test,
    resultados,
    TfidfTransformer())

In [None]:
# Show the results table accumulated so far.
resultados

---

### Modelo 3: TfidfVectorizer + MultinomialNB

In [None]:
# Model 3: a fresh TfidfVectorizer (same stop words and accent stripping as the
# count-based models) combined with the shared MultinomialNB grid search.
resultados = ejecutarModelo1(
    'M3: TfidfVectorizer + MultinomialNB',
    grid,
    TfidfVectorizer(strip_accents='unicode', stop_words=stop_words),
    train, test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

---

### Modelo 4: TfidfVectorizer (bigramas) + MultinomialNB

In [None]:
# Model 4: TF-IDF over bigrams only (ngram_range=(2, 2) excludes unigrams),
# combined with the shared MultinomialNB grid search.
resultados = ejecutarModelo1(
    'M4: TfidfVectorizer (bigramas) + MultinomialNB',
    grid,
    TfidfVectorizer(
        strip_accents='unicode',
        stop_words=stop_words,
        ngram_range=(2, 2)),
    train, test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

---

### Modelo 5: ExtraTreesClassifier

In [None]:
# Build the feature matrices shared by Models 5-8 with the CountVectorizer
# created in the Model 1 cell: the vocabulary is (re)fitted on the training
# texts only, and the test texts are transformed with that same vocabulary.
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)

In [None]:
# Model 5: a small ExtraTrees ensemble (5 estimators, fixed seed) on the count features.
# ejecutarModelo2 is defined outside this view; it returns the updated results
# table together with a second object stored as `importancia`.
resultados, importancia = ejecutarModelo2(
    'M5: ExtraTreesClassifier',
    ExtraTreesClassifier(n_estimators=5, random_state=42),
    vectorizer,
    X_train, X_test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

In [None]:
# Persist the `importancia` object returned for Model 5 as CSV.
# NOTE(review): presumably per-feature importances — confirm in ejecutarModelo2.
importancia.to_csv('Data/Importancia1.csv')

---

### Modelo 6: AdaBoostClassifier + DecisionTreeClassifier

In [None]:
# Model 6: AdaBoost over depth-3 decision trees (5 boosting rounds, fixed seeds).
# The base tree is passed positionally, exactly as in the original call.
resultados, importancia = ejecutarModelo2(
    'M6: AdaBoostClassifier + DecisionTreeClassifier',
    AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=3, random_state=42),
        n_estimators=5,
        random_state=42),
    vectorizer,
    X_train, X_test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

In [None]:
# Persist the `importancia` object returned for Model 6 as CSV.
# NOTE(review): presumably per-feature importances — confirm in ejecutarModelo2.
importancia.to_csv('Data/Importancia2.csv')

---

### Modelo 7: RandomForestClassifier

In [None]:
# Model 7: a small random forest (5 trees, fixed seed) on the count features.
# ejecutarModelo2 is defined outside this view; it returns the updated results
# table together with a second object stored as `importancia`.
resultados, importancia = ejecutarModelo2(
    'M7: RandomForestClassifier',
    RandomForestClassifier(n_estimators=5, random_state=42),
    vectorizer,
    X_train, X_test, y_train, y_test,
    resultados)

In [None]:
# Show the results table accumulated so far.
resultados

In [None]:
# Persist the `importancia` object returned for Model 7 as CSV.
# NOTE(review): presumably per-feature importances — confirm in ejecutarModelo2.
importancia.to_csv('Data/Importancia3.csv')

---

### Modelo 8: LogisticRegression

In [None]:
# Model 8: logistic regression fitted directly on the CountVectorizer features in X_train.
# C is the inverse regularisation strength, so C=1e5 applies almost no penalty;
# random_state fixes the seed.
# NOTE(review): max_iter is left at the library default — confirm the solver
# converges (no ConvergenceWarning) on this data before trusting the scores.
lr = LogisticRegression(C=1e5, random_state=42)
lr.fit(X_train, y_train)

In [None]:
# Append Model 8 to the results table by hand: lr.score returns the mean accuracy
# on the given data, computed here for the training and the test split.
# agregarResultado is defined outside this view; note that it takes the results
# table as its FIRST argument, unlike ejecutarModelo1/2 where it comes after the data.
resultados = agregarResultado(
    resultados,
    'M8: LogisticRegression', 
    lr.score(X_train, y_train),
    lr.score(X_test, y_test))

---

In [None]:
# Show the final results table with all eight models.
resultados

In [None]:
# Horizontal grouped bar chart: one row per model ("Modelo"), bar length is
# "Accuracy", one bar colour per value of "Set"; the chart is saved as a PNG.
# The Figure and Axes are created explicitly so that `fig` really is the Figure
# (sns.barplot returns an Axes, which the original code stored under the name `fig`).
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(
    data=resultados,
    y="Modelo",
    x="Accuracy",
    hue="Set",
    orient='h',
    ax=ax)
# NOTE(review): savefig does not create missing directories — 'Graficos/' must already exist.
fig.savefig('Graficos/06_ModelosClasicos.png', bbox_inches='tight')