## Preprocessing 

This notebook performs the whole preprocessing phase: the "***cleaned***" versions of the datasets are created and saved to new CSV files. 


In [None]:
# google drive settings: mount Google Drive inside the Colab runtime at
# /content/gdrive (the call prompts for an authorization code).
from google.colab import drive 
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# google drive settings 
%%capture
%cd /content/gdrive/My\ Drive/NLP

In [None]:
%%capture 
# Download the small Italian spaCy model (imported below as it_core_news_sm);
# %%capture hides the installer output.
!python3 -m spacy download it_core_news_sm

In [None]:
import pandas as pd
import csv
import re
import spacy
import it_core_news_sm
# Load the Italian spaCy pipeline once; lemmatize_text calls `nlp` on every
# tweet to obtain POS tags and lemmas.
nlp = it_core_news_sm.load()

In [None]:
# Regexes used by clean_tweet, built once instead of on every call.
# Raw strings keep \S, \w and \d as regex escapes rather than (invalid)
# Python string escapes, which raise warnings on recent Python versions.
_URL_RE = re.compile(r'http\S+')
_HASHTAG_RE = re.compile(r'#(\w+)')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
_NUMBER_RE = re.compile(r'\d+')
_PUNCT_RE = re.compile(r'[^\w\s]')
_MULTI_SPACE_RE = re.compile(' +')


def clean_tweet(text):
  """Return a normalised, lower-cased version of a raw tweet.

  Steps, in order: URLs, whole hashtags (symbol and word), hyphens,
  underscores, apostrophes, newlines, emoji, digits and any remaining
  punctuation (including '@') are each replaced by a space; runs of spaces
  are then collapsed to one and the result is lower-cased. Leading/trailing
  spaces are NOT stripped.

  Args:
    text: the raw tweet. Anything that is not a ``str`` (e.g. the NaN/None
      pandas uses for a missing value) is treated as an empty tweet.

  Returns:
    The cleaned string ('' for non-string input).
  """
  # Missing values would otherwise make re.sub raise a TypeError and abort
  # the whole DataFrame.apply.
  if not isinstance(text, str):
    return ''

  # remove urls
  clean_version = _URL_RE.sub(' ', text)

  # remove hashtags entirely (the '#' and the word that follows it)
  clean_version = _HASHTAG_RE.sub(' ', clean_version)

  # replace - and _ with whitespace (_ counts as \w, so the punctuation
  # pattern below would not catch it)
  clean_version = clean_version.replace('-', ' ')
  clean_version = clean_version.replace('_', ' ')

  # replace apostrophes with whitespace
  clean_version = clean_version.replace('\'', ' ')

  # remove \n
  clean_version = clean_version.replace('\n', ' ')

  # remove emoji
  clean_version = _EMOJI_RE.sub(' ', clean_version)

  # remove numbers
  clean_version = _NUMBER_RE.sub(' ', clean_version)

  # remove punctuation (remove also @)
  clean_version = _PUNCT_RE.sub(' ', clean_version)

  # collapse multiple spaces
  clean_version = _MULTI_SPACE_RE.sub(' ', clean_version)

  return clean_version.lower()


In [None]:
# retrieve italian stopwords from file (the content is split on newlines)
# `with` closes the file handle as soon as it has been read; the encoding is
# explicit so accented stop words do not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open("tweet_data/italian_stop_words.txt", "r", encoding="utf-8") as stop_word_file:
  stopwords_italian = stop_word_file.read().split('\n')
set_word = set(stopwords_italian) #transform it to set to allow faster evaluation

def lemmatize_text(text, stopword_list = set_word):
  """Lemmatise ``text`` and keep only selected parts of speech.

  The text is parsed with the module-level spaCy pipeline ``nlp``. A token
  contributes its lemma to the result when its coarse POS tag is one of the
  tags listed below and the lemma is not contained in ``stopword_list``.

  Args:
    text: the string to lemmatise.
    stopword_list: container of lemmas to discard (defaults to ``set_word``).

  Returns:
    The surviving lemmas, in document order, joined by single spaces.
  """
  wanted_pos = {'ADJ', 'ADV', 'NOUN', 'NUMERAL', 'NUM', 'PROPN','VERB'}
  kept_lemmas = []
  for token in nlp(text):
    if token.pos_ not in wanted_pos:
      continue
    lemma = token.lemma_
    if lemma in stopword_list:
      continue
    kept_lemmas.append(lemma)
  return " ".join(kept_lemmas)

In [None]:
# find the hashtags contained in a tweet
# Raw string: \w must reach the regex engine, not be parsed as a Python
# string escape (which raises a warning on recent Python versions).
_HASHTAG_FIND_RE = re.compile(r'#(\w+)')

def find_hashtag(tweet):
  """Return the hashtags of ``tweet`` without the leading '#'.

  Args:
    tweet: the raw tweet text. Non-string values (e.g. the NaN/None pandas
      uses for a missing value) are treated as a tweet without hashtags.

  Returns:
    A list of hashtag words in order of appearance (possibly empty).
  """
  # A missing value would otherwise make findall raise a TypeError.
  if not isinstance(tweet, str):
    return []
  return _HASHTAG_FIND_RE.findall(tweet)

In [None]:
# first week dataset --  Sunday 23 Feb - Sunday 1 March
# One CSV per day: each daily frame stays available under its own name and
# all of them are stacked into a single frame with a fresh index.
first_week_files = [
    "tweet_data/Feb23.csv",
    "tweet_data/Feb24.csv",
    "tweet_data/Feb25.csv",
    "tweet_data/Feb26.csv",
    "tweet_data/Feb27.csv",
    "tweet_data/Feb28.csv",
    "tweet_data/Feb29.csv",
    "tweet_data/March01.csv",
]
first_week_days = [pd.read_csv(path) for path in first_week_files]
feb23, feb24, feb25, feb26, feb27, feb28, feb29, march1 = first_week_days
first_week = pd.concat(first_week_days, ignore_index=True)

In [None]:
%%time
first_week['clean_tweet'] =  first_week['text'].apply(clean_tweet)

CPU times: user 4.31 s, sys: 34.5 ms, total: 4.34 s
Wall time: 4.35 s


In [None]:
%%time
first_week['clean_text'] = first_week['clean_tweet'].apply(lemmatize_text)

CPU times: user 29min 2s, sys: 3.28 s, total: 29min 5s
Wall time: 29min 6s


In [None]:
# Hashtags are extracted from the raw text, not from the cleaned one.
first_week['hashtag'] = first_week['text'].map(find_hashtag)

In [None]:
# Columns written to the cleaned CSV, in output order.
columms = [
    'text', 'clean_text', 'clean_tweet', 'hashtag',
    'date', 'year', 'month', 'day', 'hour',
]
# Every field is quoted and the DataFrame index is not written.
first_week.loc[:, columms].to_csv(
    'tweet_data/first_week_new.csv',
    quoting=csv.QUOTE_ALL,
    index=False,
)

Second week -- Sunday 15 March - Sunday 22 March


In [None]:
# second week dataset -- Sunday 15 March - Sunday 22 March
# One CSV per day: each daily frame stays available under its own name and
# all of them are stacked into a single frame with a fresh index.
second_week_files = [
    "tweet_data/March15.csv",
    "tweet_data/March16.csv",
    "tweet_data/March17.csv",
    "tweet_data/March18.csv",
    "tweet_data/March19.csv",
    "tweet_data/March20.csv",
    "tweet_data/March21.csv",
    "tweet_data/March22.csv",
]
second_week_days = [pd.read_csv(path) for path in second_week_files]
mar15, mar16, mar17, mar18, mar19, mar20, mar21, mar22 = second_week_days
second_week = pd.concat(second_week_days, ignore_index=True)

In [None]:
# Normalise the raw tweet text with clean_tweet; stored in 'clean_tweet'.
second_week['clean_tweet'] = second_week['text'].map(clean_tweet)

In [None]:
# Lemmatise the cleaned text with lemmatize_text; stored in 'clean_text'.
second_week['clean_text'] = second_week['clean_tweet'].map(lemmatize_text)

In [None]:
# Hashtags are extracted from the raw text, not from the cleaned one.
second_week['hashtag'] = second_week['text'].map(find_hashtag)

In [None]:
# Columns written to the cleaned CSV, in output order.
columms = [
    'text', 'clean_text', 'clean_tweet', 'hashtag',
    'date', 'year', 'month', 'day', 'hour',
]
# Every field is quoted and the DataFrame index is not written.
second_week.loc[:, columms].to_csv(
    'tweet_data/second_week_new.csv',
    quoting=csv.QUOTE_ALL,
    index=False,
)

Third week -- Sunday 19 April - Sunday 26 April


In [None]:
# third week dataset -- Sunday 19 April - Sunday 26 April 
# One CSV per day: each daily frame stays available under its own name and
# all of them are stacked into a single frame with a fresh index.
third_week_files = [
    "tweet_data/April19.csv",
    "tweet_data/April20.csv",
    "tweet_data/April21.csv",
    "tweet_data/April22.csv",
    "tweet_data/April23.csv",
    "tweet_data/April24.csv",
    "tweet_data/April25.csv",
    "tweet_data/April26.csv",
]
third_week_days = [pd.read_csv(path) for path in third_week_files]
apr19, apr20, apr21, apr22, apr23, apr24, apr25, apr26 = third_week_days
third_week = pd.concat(third_week_days, ignore_index=True)

In [None]:
# Normalise the raw tweet text with clean_tweet; stored in 'clean_tweet'.
third_week['clean_tweet'] = third_week['text'].map(clean_tweet)

In [None]:
%%time
third_week['clean_text'] = third_week['clean_tweet'].apply(lemmatize_text)

CPU times: user 27min 54s, sys: 2.91 s, total: 27min 57s
Wall time: 27min 58s


In [None]:
# Hashtags are extracted from the raw text, not from the cleaned one.
third_week['hashtag'] = third_week['text'].map(find_hashtag)

In [None]:
# Columns written to the cleaned CSV, in output order.
columms = [
    'text', 'clean_text', 'clean_tweet', 'hashtag',
    'date', 'year', 'month', 'day', 'hour',
]
# Every field is quoted and the DataFrame index is not written.
third_week.loc[:, columms].to_csv(
    'tweet_data/third_week_new.csv',
    quoting=csv.QUOTE_ALL,
    index=False,
)

Fourth week -- Sunday 17 May - Sunday 24 May

In [None]:
# fourth week dataset-- Sunday 17 May - Sunday 24 May 
# One CSV per day: each daily frame stays available under its own name and
# all of them are stacked into a single frame with a fresh index.
fourth_week_files = [
    "tweet_data/May17.csv",
    "tweet_data/May18.csv",
    "tweet_data/May19.csv",
    "tweet_data/May20.csv",
    "tweet_data/May21.csv",
    "tweet_data/May22.csv",
    "tweet_data/May23.csv",
    "tweet_data/May24.csv",
]
fourth_week_days = [pd.read_csv(path) for path in fourth_week_files]
may17, may18, may19, may20, may21, may22, may23, may24 = fourth_week_days
fourth_week = pd.concat(fourth_week_days, ignore_index=True)

In [None]:
%%time
fourth_week['clean_tweet'] =  fourth_week['text'].apply(clean_tweet)

CPU times: user 2.03 s, sys: 18 ms, total: 2.05 s
Wall time: 2.06 s


In [None]:
%%time
fourth_week['clean_text'] = fourth_week['clean_tweet'].apply(lemmatize_text)

CPU times: user 14min 8s, sys: 1.73 s, total: 14min 10s
Wall time: 14min 11s


In [None]:
# Hashtags are extracted from the raw text, not from the cleaned one.
fourth_week['hashtag'] = fourth_week['text'].map(find_hashtag)

In [None]:
# Columns written to the cleaned CSV, in output order.
columms = [
    'text', 'clean_text', 'clean_tweet', 'hashtag',
    'date', 'year', 'month', 'day', 'hour',
]
# Every field is quoted and the DataFrame index is not written.
fourth_week.loc[:, columms].to_csv(
    'tweet_data/fourth_week_new.csv',
    quoting=csv.QUOTE_ALL,
    index=False,
)