In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

In [44]:
# core system imports
import os

import string # Remove punctuation
import pandas as pd # Managing dataframe
import numpy as np # Handling mathematical operations on array
import pickle # library for saving the state of your dataframe
from collections import Counter

from keras import optimizers
from keras.regularizers import l2
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import (Sequential, load_model)

from keras.layers import (
    Dense,
    Dropout,
    LSTM,
    Conv1D, 
    MaxPooling1D, 
    Flatten,
    TimeDistributed
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from DataPrep.Clean_Texts import clean_text


ModuleNotFoundError: No module named 'keras'

In [45]:
# Location of the pipeline metadata store
_pipeline_root = './pipeline/'

# Directory of the raw data files
_data_root = './data/hausa'

_data_filepath = os.path.join(_data_root, "data.csv")
_stopwords_filepath = os.path.join(_data_root, "stopwords.txt")

In [46]:
# List datasets in directory
os.listdir(_data_root)

['data.csv', 'stopwords.txt']

In [47]:
# Read data from CSV file
data = pd.read_csv(_data_filepath)
data.head()

Unnamed: 0,text,author_id,Label
0,- @aishambuhari ta nemi babban sufeton 'yan sa...,7.97e+17,Neutral
1,"""Duk dan Bokon da baida Ilimin Addini Annoba n...",2290470000.0,Neutral
2,"""Duk mutumin da yayi tunanin bawa mutane ilimi...",1071387000.0,Neutral
3,"""Duk wanda ya sabawa dokar kaucewa kamuwa daga...",1.26e+18,Positive
4,"""Duk wanda ya san ya fito daga yankin da ake A...",1039268000.0,Positive


In [48]:
# Read stop words
stopwords_list = list()

with open(_stopwords_filepath) as file:
    stopwords_list = [line.strip() for line in file.readlines()]

In [49]:
listToStr = ' '.join([str(word) for word in stopwords_list])
print(listToStr)

ta da ya sai ba yi na kuma ma ji cikin in ni wata wani ce tana don za sun amma ga ina ne mai suka wannan a ko lokacin su take shi yake yana ka ban ita tafi


In [60]:
# removing stopwords
def apply_lowercase(text):
    text = [item for item in text if item not in stopwords_list]
    text = ''.join(text)
    return text

# removing stopwords
def stopwords_removal(text):
    text = [item for item in text if item not in stopwords_list]
    text = ''.join(text)
    return text

# remove punctuations
def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

In [55]:
data.head()

Unnamed: 0,text,author_id,Label
0,- @aishambuhari ta nemi babban sufeton 'yan sa...,7.97e+17,Neutral
1,"""Duk dan Bokon da baida Ilimin Addini Annoba n...",2290470000.0,Neutral
2,"""Duk mutumin da yayi tunanin bawa mutane ilimi...",1071387000.0,Neutral
3,"""Duk wanda ya sabawa dokar kaucewa kamuwa daga...",1.26e+18,Positive
4,"""Duk wanda ya san ya fito daga yankin da ake A...",1039268000.0,Positive


In [65]:
# Change string to lower case
data['text'] = data['text'].apply(str.lower)
data.sample()

Unnamed: 0,text,author_id,Label
57,bdulrm ymern ki di kje k cigb d wnke hnnu kwi ...,7.54e+17,Positive


In [66]:
# remove punctuations or special characters
data['text'] = data['text'].apply(punctuation_removal)
data.sample()

Unnamed: 0,text,author_id,Label
90,mbinhmzt ci gb d wnke hnnu 😂😂😂😂,9.02e+17,Positive


In [67]:
# remove stopwords
data['text'] = data['text'].apply(stopwords_removal)
data.sample()

Unnamed: 0,text,author_id,Label
58,bdulfthdnn eh we sunce meye n sk msk en\nwe in...,8.99e+17,Neutral


In [36]:
#We will shuffle the data to prevent bias:
from sklearn.utils import shuffle
data = shuffle(data)
data = data.reset_index(drop=True)

#Convert the text to lowercase:
data['text'] = data['text'].apply(lambda x: x.lower())



data['text'] = data['text'].apply(punctuation_removal)

from matplotlib import pyplot as plt

#Data Exploration
#How many articles per subject?
#%matplotlib inline
from matplotlib import pyplot as plt
print(data.groupby(['subject'])['text'].count())
data.groupby(['subject'])['text'].count().plot(kind="bar")
plt.show()
'''
#How many fake and real articles?
print(data.groupby(['target'])['text'].count())
data.groupby(['target'])['text'].count().plot(kind="bar")
plt.show()
'''
#Word Cloud for Negative keywords:
from wordcloud import WordCloud
fake_data = data[data["target"] == "fake"]
all_words = ' '.join([text for text in fake_data.text])
wordcloud = WordCloud(width= 800, height= 500,
                          max_font_size = 110,
                          collocations = False).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#Word Cloud for real news:
real_data = data[data["target"] == "true"]
all_words = ' '.join([text for text in fake_data.text])
wordcloud = WordCloud(width= 800, height= 500, max_font_size = 110,
 collocations = False).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
'''
#Most frequent words function:
# Most frequent words counter (Code adapted from https://www.kaggle.com/rodolfoluna/fake-news-detector)   
import nltk
import seaborn as sns
from nltk import tokenize
token_space = tokenize.WhitespaceTokenizer()
def counter(text, column_text, quantity):
    all_words = ' '.join([text for text in text[column_text]])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                   "Frequency": list(frequency.values())})
    df_frequency = df_frequency.nlargest(columns = "Frequency", n = quantity)
    plt.figure(figsize=(12,8))
    ax = sns.barplot(data = df_frequency, x = "Word", y = "Frequency", color = 'blue')
    ax.set(ylabel = "Count")
    plt.xticks(rotation='vertical')
    plt.show()
        
#Most frequent words in fake news:   
counter(data[data["target"] == "data1"], "text", 20)

#Most frequent words in real news:
#counter(data[data["target"] == "true"], "text", 20)
'''
"""
Modeling
The modeling process will consist of vectorizing the corpus stored in the â€œtextâ€ column, then applying TF-IDF, and finally a classification machine learning algorithm. Pretty standard in text analytics and NLP.
For modeling, we have this function to plot the confusion matrix of the models"""
# Function to plot the confusion matrix (code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#Split the data:
X_train,X_test,y_train,y_test = train_test_split(data['text'], data.target, test_size=0.2, random_state=42)


#Logistic Regression
# Vectorizing and applying TF-IDF
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])


# Fitting the model
model = pipe.fit(X_train, y_train)
# Accuracy
prediction = model.predict(X_test)
print(" ")
print("Logistic Regression")
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
print(" ")
print(" ")
cm = metrics.confusion_matrix(y_test, prediction)
plot_confusion_matrix(cm, classes=['Fake', 'Real'])

#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Vectorizing and applying TF-IDF
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = 20, 
                                           splitter='best', 
                                           random_state=42))])
# Fitting the model
model = pipe.fit(X_train, y_train)
# Accuracy
prediction = model.predict(X_test)
print("Deciaion Tree")
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
print(" ")
print(" ")
cm = metrics.confusion_matrix(y_test, prediction)
plot_confusion_matrix(cm, classes=['Fake', 'Real'])

# Random Forest
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=50, criterion="entropy"))])
model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)
print("Random Forest")
print(" ")
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
print(" ")
print(" ")

cm = metrics.confusion_matrix(y_test, prediction)
plot_confusion_matrix(cm, classes=['Fake', 'Real'])
'''

















KeyError: 'target'