In [1]:
import pandas as pd
# Load only the columns used by the rest of the notebook from the
# semicolon-separated dump; dtype='unicode' asks pandas for text columns.
tweets_df = pd.read_csv(
    'tweets.csv',
    delimiter=';',
    dtype='unicode',
    usecols=['user', 'timestamp', 'replies', 'likes', 'retweets', 'text'],
)

In [2]:
# The manual column subset below is redundant: read_csv already restricts the
# frame to these columns through `usecols`.
# tweets_df = tweets_df[['user', 'timestamp', 'replies', 'likes', 'retweets', 'text']]
# Display the column labels to confirm what was loaded.
tweets_df.columns

Index(['user', 'timestamp', 'replies', 'likes', 'retweets', 'text'], dtype='object')

## Preprocessing Tweets

* Parse timestamps, remove/extract URL info, select English tweets, handle @mentions and #hashtags, etc.

In [3]:
# Shape before cleaning, for comparison with the shape printed at the end.
print(tweets_df.shape)

# Drop rows that have no tweet text at all.
tweets_df = tweets_df.dropna(subset=['text'])

# Drop repeated tweets: identical text from the same user is kept only once.
tweets_df = tweets_df.drop_duplicates(subset=['text', 'user'])

# Remove the UTC offset from the timestamps.
# utc=True makes every parsed value UTC-aware regardless of the offset written
# in the raw string, so the tz_convert(None) step (convert to UTC, then drop
# the timezone) also works when offsets differ between rows or are missing.
# For input that already carries one uniform offset the result is unchanged.
tweets_df['timestamp'] = pd.to_datetime(tweets_df.timestamp, utc=True).dt.tz_convert(None)

# Shape after cleaning.
print(tweets_df.shape)

(20165013, 6)
(15638471, 6)


### Stratifying the timestamp

In [32]:
# Optional (debugging only): draw a 20,000-row random sample to iterate faster.
# sample = tweets_df.sample(n=20000)

In [4]:
# Bucket each tweet by rounding its timestamp down to the start of its
# day ('D') and of its hour ('H'); one new column per granularity.
for column, freq in (('day_interval', 'D'), ('hour_interval', 'H')):
    tweets_df[column] = tweets_df['timestamp'].dt.floor(freq)

#### Filter date range between 2016-01-01 00:00:00 and 2018-12-31 23:59:59

In [5]:
# Keep tweets posted from the first second of 2016 through the last second
# of 2018; `between` includes both bounds.
in_range = tweets_df['timestamp'].between('2016-01-01 00:00:00', '2018-12-31 23:59:59')
tweets_df = tweets_df[in_range]

In [7]:
# Drop tweets with fewer than five whitespace-separated words.
word_counts = tweets_df['text'].str.split().map(len)
tweets_df = tweets_df[word_counts >= 5]

### Select English Tweets

In [8]:
# Remove non-English tweets
import langdetect
# Pin the detector factory's seed before any detection runs.
# NOTE(review): presumably this makes langdetect's results repeatable between
# runs - confirm against the langdetect documentation.
langdetect.DetectorFactory.seed = 0
def english_detector(text):
    """Return True when langdetect classifies `text` as English.

    Detection is best-effort: if langdetect raises for this text, the tweet
    is treated as non-English and False is returned.

    Args:
        text: The tweet text to classify.

    Returns:
        bool: True if the detected language code is 'en', otherwise False.
    """
    try:
        return langdetect.detect(text) == 'en'
    except Exception:
        # Catch ordinary errors only. A bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit, making a long `.apply` over millions
        # of tweets impossible to interrupt.
        return False

# Keep only the rows whose text is classified as English.
is_english = tweets_df['text'].map(english_detector)
tweets_df = tweets_df[is_english]

In [9]:
import requests
from bs4 import BeautifulSoup
import re
def extract_url_title(url, timeout=10):
    """Fetch `url` and return the text of its HTML <title> element.

    Best-effort helper: any ordinary failure while downloading or parsing the
    page results in None instead of an exception.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can block the call forever.

    Returns:
        The title text, or None when the page cannot be fetched or parsed or
        has no <title> element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.title
        # Pages without a <title> expose `soup.title` as None.
        return title_tag.get_text() if title_tag is not None else None
    except Exception:
        # Ordinary errors only; KeyboardInterrupt/SystemExit still propagate.
        return None
# Matches http/https/ftp links. Written as a raw string so that `\/` and `\w`
# reach the regex engine untouched instead of being flagged by Python as
# invalid string escape sequences; the pattern itself is unchanged.
_URL_PATTERN = re.compile(
    r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
)


def remove_url(tweet):
    """Return `tweet` with every http/https/ftp URL deleted.

    Only the URL characters are removed; surrounding whitespace is left
    as-is, so a URL between two words leaves two consecutive spaces.

    Args:
        tweet: The tweet text.

    Returns:
        str: The text without URLs.
    """
    return _URL_PATTERN.sub('', tweet)
# Strip every link from the tweet text.
tweets_df['text'] = tweets_df['text'].map(remove_url)

In [10]:
import nltk
from nltk.corpus import words
# `nltk.corpus.words` reads the 'words' corpus, so that is the package that
# must be present; with only 'wordnet' installed, `words.words()` raises
# LookupError on a machine that has never downloaded 'words'.
nltk.download('words')
# Kept from the original cell in case other code relies on WordNet.
nltk.download('wordnet')
# Set of known English words, used for hashtag lookups.
word_set = set(words.words())
count_u, count_h = 0, 0
def preprocess_tweet(tweet):
    """Normalise mentions and hashtags in a tweet after removing URLs.

    The text is split on single spaces and each token is handled as follows:

    * a token starting with '@' (a mention) is replaced by the keyword 'USER';
    * a token starting with '#' (a hashtag) is replaced by its text without
      the '#' when that text, as written or lowercased, is in `word_set`,
      and by an empty string otherwise;
    * every other token is kept unchanged.

    Args:
        tweet: The tweet text.

    Returns:
        str: The tokens joined back with single spaces. A removed hashtag
        leaves an empty token, so consecutive spaces can remain.
    """
    tokens = remove_url(tweet).split(' ')
    for position, token in enumerate(tokens):
        if token.startswith('@'):
            tokens[position] = 'USER'
        elif token.startswith('#'):
            # Look up the text after '#': the token including '#' can never
            # be in the word list, which would delete every hashtag.
            tag = token[1:]
            is_known = tag in word_set or tag.lower() in word_set
            tokens[position] = tag if is_known else ''
    return ' '.join(tokens)

    
# Clean every tweet, drop rows whose text is missing afterwards, and renumber
# the index from zero.
tweets_df = (
    tweets_df
    .assign(text=tweets_df['text'].map(preprocess_tweet))
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

[nltk_data] Downloading package wordnet to /home/josepham/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Vader Sentiment

In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared analyzer instance, reused by apply_vader for every tweet.
analyzer = SentimentIntensityAnalyzer()

def apply_vader(tweet):
    """Score one tweet with the shared VADER `analyzer`.

    Args:
        tweet: The tweet text.

    Returns:
        pd.Series: Three values in this order - the 'pos', 'neg' and
        'compound' entries of the analyzer's polarity scores.
    """
    scores = analyzer.polarity_scores(tweet)
    return pd.Series([scores[key] for key in ('pos', 'neg', 'compound')])

# Score every tweet; the Series returned per row expands into three columns.
score_columns = ['positive_polarity', 'negative_polarity', 'compound']
tweets_df[score_columns] = tweets_df['text'].apply(apply_vader)

In [12]:
# Persist the cleaned, scored tweets without writing the index column.
tweets_df.to_csv(
    'tweets_dataset_cleaned.csv',
    index=False,
)

In [None]:
# Final (rows, columns) of the frame that was written to disk.
tweets_df.shape