## Binary Classification
The purpose of this module is to implement the ML classification model that automates the task of determining whether or not a tweet is related to the SDGs. To do this, we use a preprocessed and labeled dataset, `binarySDG_proc.csv`.

In [1]:
# imports
import ast
import csv
import re

import joblib # to store picke
import pandas as pd

# Classification
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
#from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [2]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# Later cells read the dataset from, and write the pickles to, paths under
# "drive/My Drive" -- presumably resolved relative to Colab's default working
# directory /content; verify if the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the preprocessed, labelled tweets from the mounted Drive folder.
# lineterminator='\n' makes the parser treat only "\n" as the end of a row.
dataset_path = "drive/My Drive/TFG/Datasets/binarySDG_proc.csv"
df = pd.read_csv(dataset_path, lineterminator='\n')

# Quick sanity check: column types/memory footprint, then the first rows.
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231701 entries, 0 to 2231700
Data columns (total 5 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   id          int64 
 1   created_at  object
 2   text        object
 3   label       object
 4   cleanText   object
dtypes: int64(1), object(4)
memory usage: 85.1+ MB


Unnamed: 0,id,created_at,text,label,cleanText
0,638501711070535680,2015-09-01 00:00:45+00:00,From @UGECProject: #urban #SDGs difficult bc u...,SDG,"['urban', 'sdgs', 'difficult', 'bc', 'unlike',..."
1,638502238646878209,2015-09-01 00:02:51+00:00,Internet Speech: Is It Free or Not? http://t.c...,SDG,"['internet', 'speech', 'free', 'globalcitizen'..."
2,638502291776126977,2015-09-01 00:03:03+00:00,Applied research needed to test #SDGs in citie...,SDG,"['applied', 'research', 'needed', 'test', 'sdg..."
3,638502519027712000,2015-09-01 00:03:57+00:00,"RT @ TriplePundit: For SABMiller, the #SDGs ar...",SDG,"['rt', 'triplepundit', 'sabmiller', 'sdgs', 'o..."
4,638502554393927680,2015-09-01 00:04:06+00:00,Thanks @irishmissionun for co-facilitating #SD...,SDG,"['thanks', 'cofacilitating', 'sdgs', 'join', '..."


### Remove tags
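In order to train an ML model, we need to remove the inherent label found in the text (namely, the #SDGs hashtag); otherwise the classifier could simply learn to look for that token instead of the content of the tweet.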

In [4]:
# Remove explicit tags (remaining words from the original tag #sdg*)
def tagRemove(tweets):
    """Return the tokens of one tweet as plain text, without the 'sdgs' tag.

    Parameters
    ----------
    tweets : str
        String representation of a Python list of tokens, e.g.
        "['urban', 'sdgs', 'difficult']" (the format of the `cleanText`
        column). Non-string values (such as NaN for a missing cell) are
        treated as a tweet with no tokens.

    Returns
    -------
    str
        The tokens joined by single spaces, with every token that is exactly
        'sdgs' dropped. Only that exact token is removed; variants such as
        'sdg' are kept, matching the previous behaviour.
    """
    if not isinstance(tweets, str):
        # Missing values carry no tokens; avoid an AttributeError on .strip().
        return ''

    # Parse the list literal properly: a token containing an apostrophe is
    # rendered with double quotes ("don't"), which a plain split on "', '"
    # cannot handle and which would let a neighbouring 'sdgs' token survive.
    try:
        tokens = ast.literal_eval(tweets)
    except (ValueError, SyntaxError):
        tokens = None

    if not isinstance(tokens, (list, tuple)):
        # Not a valid list literal: fall back to the original text-based split.
        tokens = tweets.strip("']['").split("', '")

    return ' '.join(str(w) for w in tokens if w != 'sdgs')

In [5]:
# Build the model input column: each tweet's cleaned tokens re-joined as plain
# text with the 'sdgs' tag removed. The function is passed directly to apply;
# wrapping it in a lambda adds nothing.
df['sdgFree'] = df['cleanText'].apply(tagRemove)

display(df.head())

Unnamed: 0,id,created_at,text,label,cleanText,sdgFree
0,638501711070535680,2015-09-01 00:00:45+00:00,From @UGECProject: #urban #SDGs difficult bc u...,SDG,"['urban', 'sdgs', 'difficult', 'bc', 'unlike',...",urban difficult bc unlike health edu etc there...
1,638502238646878209,2015-09-01 00:02:51+00:00,Internet Speech: Is It Free or Not? http://t.c...,SDG,"['internet', 'speech', 'free', 'globalcitizen'...",internet speech free globalcitizen education
2,638502291776126977,2015-09-01 00:03:03+00:00,Applied research needed to test #SDGs in citie...,SDG,"['applied', 'research', 'needed', 'test', 'sdg...",applied research needed test city bc standard ...
3,638502519027712000,2015-09-01 00:03:57+00:00,"RT @ TriplePundit: For SABMiller, the #SDGs ar...",SDG,"['rt', 'triplepundit', 'sabmiller', 'sdgs', 'o...",rt triplepundit sabmiller opportunity expand s...
4,638502554393927680,2015-09-01 00:04:06+00:00,Thanks @irishmissionun for co-facilitating #SD...,SDG,"['thanks', 'cofacilitating', 'sdgs', 'join', '...",thanks cofacilitating join u globalcitizen fes...


### Vectorize

In [6]:
# Models cannot consume raw text, so convert each tweet to a vector of TF-IDF weights.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split in the next cell, so the vocabulary and IDF weights also
# reflect the test tweets; consider splitting the text first and fitting on
# the training portion only.
X_text = df.sdgFree.values
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the matrix in one pass,
# instead of tokenizing the full corpus twice (fit, then transform).
X_vectorized = vectorizer.fit_transform(X_text)

In [7]:
# Perform train and test split (80% train / 20% test).
# random_state pins the shuffle so the same rows land in each split on every
# run, which makes the reported accuracy reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized,
                                                    df['label'],
                                                    test_size = 0.20,
                                                    shuffle = True,
                                                    random_state = 42)

### Train and test:

In [8]:
# SGDClassifier with its default hinge loss is a linear SVM trained by
# stochastic gradient descent. random_state fixes the data shuffling done
# during training so the fitted weights are reproducible.
SVM = SGDClassifier(random_state=42)

SVM.fit(X_train, y_train)

# Predict on both splits so training and test accuracy can be compared.
pred_train_SVM = SVM.predict(X_train)
pred_test_SVM = SVM.predict(X_test)

In [9]:
# accuracy_score takes (y_true, y_pred); the true labels go first. Accuracy is
# symmetric so the numbers are unchanged, but the order matters for other
# metrics and should follow the documented signature.
print('SVM:')
# Accuracy Score on Training Data
training_data_accuracy = accuracy_score(y_train, pred_train_SVM)
print('Accuracy score on the training data: ',training_data_accuracy)
# Accuracy Score on Test Data
test_data_accuracy = accuracy_score(y_test, pred_test_SVM)
print('Accuracy score on the test data: ',test_data_accuracy)

SVM:
Accuracy score on the training data:  0.935797822287942
Accuracy score on the test data:  0.9345903692468315


In [10]:
# Persist the trained classifier together with the fitted vectorizer: both are
# needed later, since new text must be transformed with the same vocabulary
# before it can be passed to the model.
model_path = "drive/My Drive/TFG/biSVM.pkl"
vectorizer_path = "drive/My Drive/TFG/biSVM_vectorizer.pkl"

joblib.dump(SVM, model_path)
joblib.dump(vectorizer, vectorizer_path)

['drive/My Drive/TFG/biSVM_vectorizer.pkl']