# Data Preparation and Feature Extraction

- Extracting the reliability of each user's tweets from their likes and retweets
- Extracting the reputation (popularity) of each user from their follower count
- Extracting the sentiment of each user's tweets from their polarity

In [1]:
import pandas as pd
import openpyxl
import xlrd

from pandas import ExcelFile

## Import Data

In [2]:
# Load the raw tweet spreadsheet into a DataFrame; the path is relative to the working directory.
raw_dataset = pd.read_excel("../../resources/dataset/tweets.xlsx")

In [3]:
# Format the message first and print a single string: this emits the same
# "shape: (rows, cols)" text under Python 2 and Python 3, whereas a bare
# print statement is a syntax error on Python 3.
print("shape: %s" % (raw_dataset.shape,))
# raw_dataset = raw_dataset.sort(['user_screen_name'], ascending=[1])
# print raw_dataset[['retweet_count', 'favorite_count', 'user_screen_name', 'user_followers_count']].head(1000)

shape: (1264, 18)


## User Data Frame

- First build a map from each user to its features, then build the data frame from that map

In [4]:
# Aggregate per-user features over every row of the raw sheet:
# likes and retweets are summed across a user's rows, while followers and
# related_tweets keep the value from the last row seen for that user.
user_features = {}
for _, tweet in raw_dataset.iterrows():
    screen_name = tweet.user_screen_name.strip()
    seen = user_features.get(screen_name)

    if seen is None:
        # First row for this user: start the running totals.
        likes_total = tweet.favorite_count
        retweets_total = tweet.retweet_count
    else:
        likes_total = seen['likes'] + tweet.favorite_count
        retweets_total = seen['retweets'] + tweet.retweet_count

    user_features[screen_name] = {
        'likes': likes_total,
        'retweets': retweets_total,
        'followers': tweet.user_followers_count,
        'related_tweets': tweet.related_tweets,
    }

In [5]:
# One sequential id (starting at 0) per distinct user in user_features.
user_count = len(user_features)
clean_dataset = pd.DataFrame(list(range(user_count)), columns=["id"])

In [6]:
# Fix the user order once, then derive every feature column from that same
# order so all columns stay aligned row-for-row.
usernames_column = list(user_features)
likes_column = [user_features[name]['likes'] for name in usernames_column]
retweets_column = [user_features[name]['retweets'] for name in usernames_column]
followers_column = [user_features[name]['followers'] for name in usernames_column]
related_tweets_column = [user_features[name]['related_tweets'] for name in usernames_column]

# Attach the columns to the frame in a fixed order.
for column_name, column_values in [
    ('username', usernames_column),
    ('likes', likes_column),
    ('retweets', retweets_column),
    ('followers', followers_column),
    ('related_tweets', related_tweets_column),
]:
    clean_dataset[column_name] = column_values

In [7]:
# print(...) with a single argument produces the same output on Python 2 and
# Python 3; the bare print statement only parses on Python 2.
print(clean_dataset.shape)
# Spot-check one user's aggregated row; as the cell's last expression it is
# rendered by the notebook.
clean_dataset[clean_dataset.username.str.contains('lasinger711')]

(1143, 6)


Unnamed: 0,id,username,likes,retweets,followers,related_tweets
641,641,lasinger711,2,0,102,a student has created a gripping and nsfw phot...


# Extracting Features

In [8]:
# Start the feature frame with the same ids as clean_dataset.
# Copy the column directly rather than looping with a variable named `id`,
# which shadows the builtin (and, on Python 2, leaks out of the comprehension
# into the notebook namespace).
new_dataset = pd.DataFrame(clean_dataset['id'].tolist(), columns=["id"])

## Extracting reliability

Using the number of likes and retweets, with each retweet weighted five times as much as a like (replies are not part of the current formula)

In [9]:
def calculate_reliabilty(tweet, retweet_weight=5):
    """Score reliability from engagement counts.

    Parameters
    ----------
    tweet : object exposing ``retweets`` and ``likes`` attributes
        For example a row of ``clean_dataset``.
    retweet_weight : number, optional
        How many likes one retweet is worth. Defaults to 5, the weight that
        was previously hard-coded, so existing callers get the same score.

    Returns
    -------
    The value ``tweet.retweets * retweet_weight + tweet.likes``.
    """
    return tweet.retweets * retweet_weight + tweet.likes

In [10]:
# Score every user row; iterrows() yields (index, row) pairs and only the row
# is passed to the scoring function.
reliabilty_scores = []
for _, user_row in clean_dataset.iterrows():
    reliabilty_scores.append(calculate_reliabilty(user_row))
new_dataset['reliabilty'] = reliabilty_scores

In [11]:
# Preview the first two rows to check the newly added 'reliabilty' column.
new_dataset.head(2)

Unnamed: 0,id,reliabilty
0,0,0
1,1,0


## Extracting popularity

Use the number of followers of the tweet's author

In [12]:
# Popularity is the follower count itself, so read the column directly instead
# of building a full row Series per user with iterrows() just to pick one field.
# tolist() hands over plain values, so the assignment is purely positional.
new_dataset['popularity'] = clean_dataset['followers'].tolist()

In [13]:
# Preview the first two rows to check the newly added 'popularity' column.
new_dataset.head(2)

Unnamed: 0,id,reliabilty,popularity
0,0,0,528
1,1,0,477


# Extracting Polarity

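Using the Hu and Liu (2004) "opinion lexicon": each word of the text found in the positive list adds 1 to the score, and each word found in the negative list subtracts 1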

In [14]:
# Load the negative word list: one word per line, lines starting with ';' are comments.
negative_words = set()
with open("../../resources/Hu and Liu (2004)/negative-words.txt") as f:
    # Iterate the file lazily; readlines() would hold every line in memory first.
    for line in f:
        word = line.strip()
        # Skip blank lines so the empty string never enters the lexicon,
        # and skip ';' comment lines.
        if word and line[0] != ';':
            negative_words.add(word)

In [15]:
# Load the positive word list: one word per line, lines starting with ';' are comments.
positive_words = set()
with open("../../resources/Hu and Liu (2004)/positive-words.txt") as f:
    # Iterate the file lazily; readlines() would hold every line in memory first.
    for line in f:
        word = line.strip()
        # Skip blank lines so the empty string never enters the lexicon,
        # and skip ';' comment lines.
        if word and line[0] != ';':
            positive_words.add(word)

In [16]:
def calculate_polarity(text):
    """Return a lexicon-based polarity score for ``text``.

    The text is split on whitespace; every token found in ``negative_words``
    subtracts 1 and every token found in ``positive_words`` adds 1 (the
    negative set is checked first). Tokens are matched exactly as they appear.

    Values without a ``split`` method, such as ``None`` or the float NaN that
    pandas uses for an empty cell, contain no words and score 0 instead of
    raising ``AttributeError``.
    """
    try:
        words = text.split()
    except AttributeError:
        # Not string-like (missing value): there is nothing to score.
        return 0

    polarity_score = 0
    for word in words:
        if word in negative_words:
            polarity_score -= 1
        elif word in positive_words:
            polarity_score += 1
    return polarity_score

In [17]:
# Only the tweet text is needed, so iterate that column directly rather than
# materialising a full row Series per user with iterrows().
new_dataset['polarity'] = list(map(calculate_polarity, clean_dataset['related_tweets']))

In [18]:
# Preview the first four rows to check the newly added 'polarity' column.
new_dataset.head(4)

Unnamed: 0,id,reliabilty,popularity,polarity
0,0,0,528,0
1,1,0,477,0
2,2,0,188,2
3,3,0,515,1


## Export Data

In [19]:
import openpyxl
from pandas import ExcelWriter

# Hand the target path straight to to_excel so pandas opens, writes and closes
# the workbook itself. This avoids ExcelWriter.save(), which current pandas
# releases no longer provide, and names the sheet by keyword instead of position.
new_dataset.to_excel('../../resources/dataset/preprocessedDataset.xlsx', sheet_name='Sheet1')