# Tweets preprocessing

In [1]:
# Data preprocessing for Crisis dataset v1.0

# This notebook was created in Google Colab; please change the path so that it points to your own file.
# Every preprocessing method is separate, so you can run only the ones that you need.
# Write me a note if something goes wrong or if you need some new preprocessing methods.

# Enjoy!

In [2]:
!pip install contractions



In [3]:
import pandas as pd
import unicodedata
import re
import contractions

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenizer, the stop-word filter and the
# lemmatizer below. 'punkt_tab' is what word_tokenize looks up in recent NLTK
# releases; older releases only need 'punkt', and on those the extra download
# is expected to just report that the package is unknown — TODO confirm on the
# NLTK version you run.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Location of the tab-separated dataset (Colab path; adjust to your own file).
path = '/content/earthquakes_eyewitness_crowdflower_2000.tsv'

# Read the file, keep only the id and text columns, and expose the text as
# 'Tweets'. The chain builds a fresh frame instead of renaming a column
# selection in place, so re-running the cell is safe and the later column
# assignments do not operate on a derived slice.
tweets_df = (
    pd.read_csv(path, sep="\t")[['_unit_id', 'text']]
    .rename(columns={'text': 'Tweets'})
)

In [5]:
# Inspect the raw tweets before running the preprocessing steps below
tweets_df

Unnamed: 0,_unit_id,Tweets
0,1846692712,"TheReformedCrow Nah, I'm gonna go with earthq..."
1,1846692769,I think we just had an earthquake
2,1846692882,Uhh who else felt that earthquake tho
3,1846694004,Bay area just had a nice size earthquake
4,1846693321,Thought my dad was farting...turns out it was ...
...,...,...
1995,1846692449,5/13/18 10pm earthquake update dutchsinse http...
1996,1846692498,M5.1 #earthquake (#sГ©isme) strikes 59 km E of...
1997,1846694359,"4.5 earthquake, eastern Honshu, Japan. 2018-05..."
1998,1846692571,Magnitude 5.8 earthquake 80km N of Visokoi Isl...


In [6]:
# Make sure you run this one before other methods!

def to_lowercase(text):
    """Return a copy of `text` with every cased character lower-cased."""
    lowered = text.lower()
    return lowered

# Demonstrate the function on a single sample sentence
print(to_lowercase('IN CHINESE WE CALL CAPITALIZATION AS BIG WRITTING, IN GERMAN AS WELL.'))

# Lower-case the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(to_lowercase)

in chinese we call capitalization as big writting, in german as well.


In [7]:
def standardize_accented_chars(text):
    """Return `text` reduced to plain ASCII.

    The text is NFKD-normalized so accented letters split into a base letter
    plus combining marks; every character that cannot be encoded as ASCII is
    then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards whatever has no ASCII representation.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Demonstrate the function on a single sample sentence
print(standardize_accented_chars('sómě words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.'))
# Replace accented characters in the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(standardize_accented_chars)

some words such as resume, cafe, protest, divorce, coordinate, expose, latte.


In [8]:
# Not a method, just to check how many tweets contain urls

def get_number_of_urls(documents):
    """Print the percentage of documents that contain the substring 'http'.

    Parameters
    ----------
    documents : pandas.Series of str
        One text per entry.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    total = len(documents)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        share = 0.0
    else:
        # Use a membership test so that documents that START with a url
        # (substring found at index 0) are counted as well.
        has_url = documents.apply(lambda doc: 'http' in doc)
        share = has_url.sum() / total * 100
    print("{:.2f}% of documents contain urls".format(share))

# Report the share of tweets that contain a url. The function prints its own
# summary and returns None, so it is called directly instead of being wrapped
# in print() (which would add a stray "None" line to the output).
get_number_of_urls(tweets_df['Tweets'])

14.20% of documents contain urls
None


In [9]:
def remove_url(text):
    """Return `text` with every 'http:' / 'https:' token removed.

    A match starts at 'http:' or 'https:' and runs up to the next whitespace
    character; the surrounding whitespace is left in place.
    """
    url_pattern = re.compile(r'https?:\S*')
    return url_pattern.sub('', text)

# Demonstrate the function on a single sample sentence
print(remove_url('using https://www.google.com/ as an example'))

# Strip urls from the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(remove_url)

using  as an example


In [10]:
def expand_contractions(text):
    """Return `text` with each whitespace-separated token passed through
    `contractions.fix` and the results re-joined with single spaces.

    Because the text is split on whitespace and joined with ' ', runs of
    whitespace in the input collapse to one space.
    """
    return ' '.join(contractions.fix(token) for token in text.split())

# Demonstrate the function on a single sample sentence
print(expand_contractions("Don't is the same as do not"))

# Expand contractions in the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(expand_contractions)

Do not is the same as do not


In [11]:
def remove_mentions_and_tags(text):
    """Return `text` without @mentions and #hashtags.

    Mentions are stripped first, then hashtags. Each match runs from the
    marker character up to the next whitespace character.
    """
    for marker_pattern in (r'@\S*', r'#\S*'):
        text = re.sub(marker_pattern, '', text)
    return text

# Demonstrate the function on a single sample sentence
print(remove_mentions_and_tags('Some random @abc and #def'))

# Strip mentions and hashtags from the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(remove_mentions_and_tags)

Some random  and 


In [12]:
def keep_only_alphabet(text):
    """Return `text` with every character outside a-z / A-Z replaced by a space.

    The replacement is one-for-one, so the length of the text is unchanged.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

# Demonstrate the function on a single sample sentence
print(keep_only_alphabet('Just a bit more $$processing required.Just a bit!!!'))

# Replace non-letter characters in the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(keep_only_alphabet)

Just a bit more   processing required Just a bit   


In [13]:
def remove_stop_words(text):
  """
  Returns text without stop words

  The text is tokenized with word_tokenize, every token that appears in
  NLTK's English stop-word list is dropped, and the remaining tokens are
  joined with single spaces. The comparison is an exact string match.
  """
  # Load the stop-word list once per call and keep it in a set: the original
  # re-fetched the whole list and scanned it linearly for every token.
  stop_words = set(stopwords.words('english'))
  return ' '.join(word for word in word_tokenize(text) if word not in stop_words)


# Demonstrate the function on a single sample sentence
print(remove_stop_words('Test this text to see which are stop words.'))

# Drop stop words from the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(remove_stop_words)

Test text see stop words .


In [14]:
def lemmatize(text):
  """
  Returns text with every token lemmatized

  The text is tokenized with word_tokenize, each token is passed to
  WordNetLemmatizer.lemmatize with its default arguments, and the results are
  joined with single spaces.
  """
  wordnet = WordNetLemmatizer()
  lemmas = [wordnet.lemmatize(token) for token in word_tokenize(text)]
  return ' '.join(lemmas)

# Demonstrate the function on a single sample sentence
print(lemmatize('apples, bananas and pears are common fruits that are eaten by humans.'))

# Lemmatize the text of every tweet
tweets_df['Tweets'] = tweets_df['Tweets'].apply(lemmatize)

apple , banana and pear are common fruit that are eaten by human .


In [15]:
# Inspect the tweets again after running the preprocessing steps above
tweets_df

Unnamed: 0,_unit_id,Tweets
0,1846692712,thereformedcrow nah going go earthquake
1,1846692769,think earthquake
2,1846692882,uhh else felt earthquake though
3,1846694004,bay area nice size earthquake
4,1846693321,thought dad farting turn earthquake
...,...,...
1995,1846692449,pm earthquake update dutchsinse via youtube
1996,1846692498,strike km e min ago effect reported eyewitness
1997,1846694359,earthquake eastern honshu japan utc epicenter ...
1998,1846692571,magnitude earthquake km n visokoi island south...
