In [1]:
import re
import sys
from utils import write_status
from nltk.stem.porter import PorterStemmer
import pandas as pd
import nltk
from nltk import FreqDist
import pickle
import sys
from utils import write_status
from collections import Counter

In [2]:
def preprocess_word(word):
    word = word.strip('\'"?!,.():;')
    word = re.sub(r'(.)\1+', r'\1\1', word)
    word = re.sub(r'(-|\')', '', word)
    return word

In [3]:
def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

In [4]:
urlfil="F:/dataset/US airline-data/"

In [5]:
def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

In [6]:
def preprocess_tweet(tweet):
    processed_tweet = []
    # Convert to lower case
    tweet = tweet.lower()
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    words = tweet.split()

    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
            processed_tweet.append(word)

    return ' '.join(processed_tweet)


In [7]:
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r', encoding = "ISO-8859-1") as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' %
                                   (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print ('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name

In [8]:
if len(sys.argv) != 2:
    print(sys.argv[0])
use_stemmer = False
csv_file_name = urlfil+"dataset.csv"
processed_file_name = urlfil + 'processed_data.csv'
print(processed_file_name)
preprocess_csv(csv_file_name, processed_file_name, test_file=False)

C:\Users\kamyab\anaconda3\lib\site-packages\ipykernel_launcher.py
F:/dataset/US airline-data/processed_data.csv
Processing 2978/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 7449/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 11982/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 14640/14640
Saved processed tweets to: F:/dataset/US airline-data/processed_data.csv


'F:/dataset/US airline-data/processed_data.csv'

In [9]:
def analyze_tweet(tweet):
    result = {}
    result['MENTIONS'] = tweet.count('USER_MENTION')
    result['URLS'] = tweet.count('URL')
    result['POS_EMOS'] = tweet.count('EMO_POS')
    result['NEG_EMOS'] = tweet.count('EMO_NEG')
    tweet = tweet.replace('USER_MENTION', '').replace(
        'URL', '')
    words = tweet.split()
    result['WORDS'] = len(words)
    bigrams = get_bigrams(words)
    result['BIGRAMS'] = len(bigrams)
    return result, words, bigrams

In [10]:
def get_bigrams(tweet_words):
    bigrams = []
    num_words = len(tweet_words)
    for i in range(num_words - 1):
        bigrams.append((tweet_words[i], tweet_words[i + 1]))
    return bigrams

In [11]:
def get_bigram_freqdist(bigrams):
    freq_dict = {}
    for bigram in bigrams:
        if freq_dict.get(bigram):
            freq_dict[bigram] += 1
        else:
            freq_dict[bigram] = 1
    counter = Counter(freq_dict)
    return counter

In [13]:
num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
num_mentions, max_mentions = 0, 0
num_emojis, num_pos_emojis, num_neg_emojis, max_emojis = 0, 0, 0, 0
num_urls, max_urls = 0, 0
num_words, num_unique_words, min_words, max_words = 0, 0, 1e6, 0
num_bigrams, num_unique_bigrams = 0, 0
all_words = []
all_bigrams = []
with open(urlfil+'processed_data.csv', 'r') as csv:
    lines = csv.readlines()
    num_tweets = len(lines)
    for i, line in enumerate(lines):
        t_id, if_pos, tweet = line.strip().split(',')
        if_pos = int(if_pos)
        if if_pos:
            num_pos_tweets += 1
        else:
            num_neg_tweets += 1
        result, words, bigrams = analyze_tweet(tweet)
        num_mentions += result['MENTIONS']
        max_mentions = max(max_mentions, result['MENTIONS'])
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(
            max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])
        num_words += result['WORDS']
        min_words = min(min_words, result['WORDS'])
        max_words = max(max_words, result['WORDS'])
        all_words.extend(words)
        num_bigrams += result['BIGRAMS']
        all_bigrams.extend(bigrams)
        write_status(i + 1, num_tweets)
num_emojis = num_pos_emojis + num_neg_emojis
unique_words = list(set(all_words))
with open(urlfil+'processed-unique.txt', 'w') as uwf:
    uwf.write('\n'.join(unique_words))
num_unique_words = len(unique_words)
num_unique_bigrams = len(set(all_bigrams))
print ('\nCalculating frequency distribution')
# Unigrams
freq_dist = FreqDist(all_words)
pkl_file_name = urlfil+'processed-freqdist.pkl'
with open(pkl_file_name, 'wb') as pkl_file:
    pickle.dump(freq_dist, pkl_file)
print ('Saved uni-frequency distribution to %s' % pkl_file_name)
# Bigrams
bigram_freq_dist = get_bigram_freqdist(all_bigrams)
bi_pkl_file_name = urlfil+'processed-freqdist-bi.pkl'
with open(bi_pkl_file_name, 'wb') as pkl_file:
    pickle.dump(bigram_freq_dist, pkl_file)
print ('Saved bi-frequency distribution to %s' % bi_pkl_file_name)
print ('\n[Analysis Statistics]')
print ('Tweets => Total: %d, Positive: %d, Negative: %d' % (num_tweets, num_pos_tweets, num_neg_tweets))
print ('User Mentions => Total: %d, Avg: %.4f, Max: %d' % (num_mentions, num_mentions / float(num_tweets), max_mentions))
print ('URLs => Total: %d, Avg: %.4f, Max: %d' % (num_urls, num_urls / float(num_tweets), max_urls))
print ('Emojis => Total: %d, Positive: %d, Negative: %d, Avg: %.4f, Max: %d' % (num_emojis, num_pos_emojis, num_neg_emojis, num_emojis / float(num_tweets), max_emojis))
print ('Words => Total: %d, Unique: %d, Avg: %.4f, Max: %d, Min: %d' % (num_words, num_unique_words, num_words / float(num_tweets), max_words, min_words))
print ('Bigrams => Total: %d, Unique: %d, Avg: %.4f' % (num_bigrams, num_unique_bigrams, num_bigrams / float(num_tweets)))

Processing 2972/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 7512/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 12087/14640

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Processing 14640/14640
Calculating frequency distribution
Saved uni-frequency distribution to F:/dataset/US airline-data/processed-freqdist.pkl
Saved bi-frequency distribution to F:/dataset/US airline-data/processed-freqdist-bi.pkl

[Analysis Statistics]
Tweets => Total: 14640, Positive: 11541, Negative: 3099
User Mentions => Total: 16318, Avg: 1.1146, Max: 6
URLs => Total: 1211, Avg: 0.0827, Max: 3
Emojis => Total: 343, Positive: 241, Negative: 102, Avg: 0.0234, Max: 2
Words => Total: 230725, Unique: 11977, Avg: 15.7599, Max: 30, Min: 0
Bigrams => Total: 216090, Unique: 89348, Avg: 14.7602
