## Merge and clean relevant SDG dataset
This file is meant to prepare the Dataset for training a binary classification ML algorithm that chooses relevant tweets (tweets related to the SDGs) or non-relevant. In order to do that we will need to label and merge both datasets. Most importantly, the text data will be preprocessed and stored.

In [16]:
import pandas as pd
import csv
import datetime

# Visualization
#import matplotlib.pyplot as plt
#import seaborn as sns

# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [49]:
# Mount the drive, if run locally, this step can be skiped
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# load datasets
sdg_df =pd.read_csv("drive/My Drive/TFG/Datasets/tweetsSDG_enALL.csv",
                    lineterminator='\n', usecols=['twid','authid','created_at','text'])
sdg_df.head()

Unnamed: 0,twid,authid,created_at,text
0,638501711070535680,2657143950,2015-09-01 00:00:45+00:00,From @UGECProject: #urban #SDGs difficult bc u...
1,638502238646878209,597917107,2015-09-01 00:02:51+00:00,Internet Speech: Is It Free or Not? http://t.c...
2,638502291776126977,2657143950,2015-09-01 00:03:03+00:00,Applied research needed to test #SDGs in citie...
3,638502519027712000,344831740,2015-09-01 00:03:57+00:00,"RT @ TriplePundit: For SABMiller, the #SDGs ar..."
4,638502554393927680,1578572066,2015-09-01 00:04:06+00:00,Thanks @irishmissionun for co-facilitating #SD...


In [23]:
nosdg_df =pd.read_csv("drive/My Drive/TFG/Datasets/NoRelevant_enAll.csv",
                      lineterminator='\n')
                    
nosdg_df.head()

Unnamed: 0,id,created_at,text
0,717502078546014208,2016-04-06T00:00:00.000Z,"Ayy lmao HE CAN HANDSTAND, WHEN HE HAS NO GRAC..."
1,717502078327853056,2016-04-06T00:00:00.000Z,.@RchrdAlln suggests Kingston Mills Locks. Wev...
2,717502078331985920,2016-04-06T00:00:00.000Z,@likeold2's account is temporarily unavailable...
3,717502078336180224,2016-04-06T00:00:00.000Z,"Wind 3.0 mph SSE. Barometer 30.237 in, Falling..."
4,717502078340431872,2016-04-06T00:00:00.000Z,I've completed the daily quest in Paradise Isl...


In [26]:
# Change attribute names and drop rows to match
sdg_df.rename(columns={'twid':'id'}, inplace=True)
sdg_df.drop(columns=['authid'], inplace=True)
sdg_df.info()
nosdg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482079 entries, 0 to 1482078
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   id          1482079 non-null  int64 
 1   created_at  1482079 non-null  object
 2   text        1482079 non-null  object
dtypes: int64(1), object(2)
memory usage: 33.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843607 entries, 0 to 843606
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          843607 non-null  int64 
 1   created_at  843607 non-null  object
 2   text        843607 non-null  object
dtypes: int64(1), object(2)
memory usage: 19.3+ MB


### Clean datasets
#### Remove duplicates

In [29]:
print('Number of duplicated tweet ids:')
print(len(sdg_df[sdg_df['id'].duplicated()]))
print(len(nosdg_df[nosdg_df['id'].duplicated()]))

print('Number of duplicated text:')
print(len(sdg_df[sdg_df['text'].duplicated()]))
print(len(nosdg_df[nosdg_df['text'].duplicated()]))

Number of duplicated tweet ids:
0
0
Number of duplicated text:
87708
6277


In [30]:
# Remove text duplicates in order to have an heterogeneous training set
sdg_df.drop_duplicates(subset=['text'], inplace=True)
nosdg_df.drop_duplicates(subset=['text'], inplace=True)

### Add label attribute and merge

In [31]:
sdg_df['label'] = 'SDG'
nosdg_df['label'] = 'NO'

In [34]:
df = pd.concat([sdg_df,nosdg_df])
df.head()

Unnamed: 0,id,created_at,text,label
0,638501711070535680,2015-09-01 00:00:45+00:00,From @UGECProject: #urban #SDGs difficult bc u...,SDG
1,638502238646878209,2015-09-01 00:02:51+00:00,Internet Speech: Is It Free or Not? http://t.c...,SDG
2,638502291776126977,2015-09-01 00:03:03+00:00,Applied research needed to test #SDGs in citie...,SDG
3,638502519027712000,2015-09-01 00:03:57+00:00,"RT @ TriplePundit: For SABMiller, the #SDGs ar...",SDG
4,638502554393927680,2015-09-01 00:04:06+00:00,Thanks @irishmissionun for co-facilitating #SD...,SDG


### Text preprocessing
This section is computationally intensive

In [35]:
# Useful functions
# Tweet text preleaning
def tweet_preCleaning(tweet):
    # replace endlines with spaces
    tweet = tweet.replace('\n',' ')
    
    # Remove media links and undisired characters
    return re.sub(r"(@[A-Za-z0–9_]+)|#|http\S+|sdgs?|&\w+|[^\w\s]", '', tweet)

def stopword_removal(tweet):
    return [w for w in tweet if w not in stopwords.words('english')]

def tweet_lemmatizing (tweet): # input: list of tokenized words from a tweet
    lemmatizer = WordNetLemmatizer() # lemmatizer instantiation
    return [lemmatizer.lemmatize(word) for word in tweet]

In [36]:
# Create a column to process text for safety
df['cleanText'] = df['text']

In [37]:
# Removing html (media in twitter is converted into links too), removing endlines and unescaping html entities
df['cleanText'] = df['cleanText'].apply(lambda x: tweet_preCleaning(x))
display(df['cleanText'].head())

0    From  urban SDGs difficult bc unlike health ed...
1    Internet Speech Is It Free or Not   globalciti...
2    Applied research needed to test SDGs in cities...
3    RT  TriplePundit For SABMiller the SDGs are an...
4    Thanks  for cofacilitating SDGs   join us at t...
Name: cleanText, dtype: object

In [45]:
# Tokenization, remove stopwords and punctuation (This may take a while)

# text is lowered, then tokenized and stopwords are removed
df['cleanText'] = df['cleanText'].apply(lambda x: word_tokenize(x.lower()))

In [46]:
# Stopword removal
df['cleanText'] = df['cleanText'].apply(lambda x: stopword_removal(x))
display(df['cleanText'].head())

0    [urban, sdgs, difficult, bc, unlike, health, e...
1    [internet, speech, free, globalcitizen, educat...
2    [applied, research, needed, test, sdgs, cities...
3    [rt, triplepundit, sabmiller, sdgs, opportunit...
4    [thanks, cofacilitating, sdgs, join, us, globa...
Name: cleanText, dtype: object

In [47]:
# Lemmatizing

df['cleanText'] = df['cleanText'].apply(lambda x: tweet_lemmatizing(x))
display(df['cleanText'].head())

0    [urban, sdgs, difficult, bc, unlike, health, e...
1    [internet, speech, free, globalcitizen, educat...
2    [applied, research, needed, test, sdgs, city, ...
3    [rt, triplepundit, sabmiller, sdgs, opportunit...
4    [thanks, cofacilitating, sdgs, join, u, global...
Name: cleanText, dtype: object

### Store the dataset

In [50]:
display(df.head())
df.to_csv('drive/My Drive/TFG/Datasets/binarySDG_proc.csv', index=False)

Unnamed: 0,id,created_at,text,label,cleanText
0,638501711070535680,2015-09-01 00:00:45+00:00,From @UGECProject: #urban #SDGs difficult bc u...,SDG,"[urban, sdgs, difficult, bc, unlike, health, e..."
1,638502238646878209,2015-09-01 00:02:51+00:00,Internet Speech: Is It Free or Not? http://t.c...,SDG,"[internet, speech, free, globalcitizen, educat..."
2,638502291776126977,2015-09-01 00:03:03+00:00,Applied research needed to test #SDGs in citie...,SDG,"[applied, research, needed, test, sdgs, city, ..."
3,638502519027712000,2015-09-01 00:03:57+00:00,"RT @ TriplePundit: For SABMiller, the #SDGs ar...",SDG,"[rt, triplepundit, sabmiller, sdgs, opportunit..."
4,638502554393927680,2015-09-01 00:04:06+00:00,Thanks @irishmissionun for co-facilitating #SD...,SDG,"[thanks, cofacilitating, sdgs, join, u, global..."
