In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import TruncatedSVD

# Extraction of features

## 1. Definition of functions

### 1.1 Extraction of user features

In [None]:
# Extract the user features
def extract_user_features(data_origin, data_transformed):
    """Copy the account-level features of each tweet's author into ``data_transformed``.

    Parameters
    ----------
    data_origin : pandas.DataFrame
        Raw data; the columns ``user_verified``, ``user_statuses_count``,
        ``user_followers_count`` and ``user_friends_count`` are read.
    data_transformed : pandas.DataFrame
        Feature table, modified in place. Nothing is returned.
    """
    print('extracting user features')

    # Columns that are carried over unchanged apart from an integer cast:
    #   user_verified        - boolean flag turned into 1 (true) / 0 (false)
    #   user_statuses_count  - number of tweets (statuses) the user has published
    #   user_followers_count - number of followers the user has
    #   user_friends_count   - number of friends the user has
    integer_columns = (
        'user_verified',
        'user_statuses_count',
        'user_followers_count',
        'user_friends_count',
    )
    for column in integer_columns:
        data_transformed[column] = data_origin[column].astype(int)

    # ratio_friends_followers: friends divided by (followers + 1); the +1.0
    # keeps the denominator non-zero for accounts without followers.
    friends = data_origin["user_friends_count"]
    followers_plus_one = data_origin["user_followers_count"] + 1.0
    data_transformed['ratio_friends_followers'] = friends / followers_plus_one

### 1.2 Extraction of tweet features

In [3]:
# Extract the tweet features
def extract_tweet_features(data_origin, data_transformed):
    """Derive presence flags and counts for mentions, URLs and hashtags.

    For each of the raw columns ``user_mentions``, ``urls`` and ``hashtags``
    two columns are written into ``data_transformed`` (modified in place):

    * ``<name>_exist`` - 1 when the raw cell is not missing, otherwise 0
    * ``<name>_count`` - number of comma-separated items in the cell,
      0 when the cell is missing

    Nothing is returned.
    """
    print('extracting tweet features')

    def count_items(cell):
        # A missing cell means the tweet has none of this entity.
        if pd.isna(cell):
            return 0
        return len(cell.split(','))

    # (raw column, prefix of the generated feature names)
    entity_columns = (
        ('user_mentions', 'mention'),
        ('urls', 'url'),
        ('hashtags', 'hashtag'),
    )
    for raw_column, prefix in entity_columns:
        cells = data_origin[raw_column]
        data_transformed[prefix + '_exist'] = cells.notna().astype(int)
        data_transformed[prefix + '_count'] = cells.apply(count_items)

### 1.3 Extraction of time features

In [4]:
# Extract time features
def extract_time_features(data_origin, data_transformed):
    """Derive posting-time features from the ``timestamp`` column.

    ``timestamp`` is read as milliseconds since the Unix epoch and is
    interpreted in UTC, so the resulting features do not depend on the
    timezone of the machine that runs the notebook.

    Columns written into ``data_transformed`` (modified in place):

    * ``timeseg``     - hour of the day shifted to the range 1..24
    * ``day_of_week`` - ISO weekday, 1 (Monday) .. 7 (Sunday)
    * ``weekend``     - 1 when ``day_of_week`` is 6 or 7, otherwise 0

    Nothing is returned.
    """
    print('extracting time features')

    # Convert the whole column once instead of parsing every timestamp twice
    # in a per-row apply; utc=True pins the timezone explicitly.
    posted_at = pd.to_datetime(data_origin['timestamp'], unit='ms', utc=True)

    # timeseg: the time segment of a tweet {1...24} indicating when it is posted
    data_transformed['timeseg'] = posted_at.dt.hour + 1

    # day_of_week: pandas counts Monday as 0, so +1 yields the ISO weekday {1...7}
    day_of_week = posted_at.dt.dayofweek + 1
    data_transformed['day_of_week'] = day_of_week

    # weekend: 1 or 0 to indicate whether a tweet is posted on a weekend or not
    data_transformed['weekend'] = day_of_week.isin([6, 7]).astype(int)

### 1.4 Extraction of text features

In [5]:
# Extract text features
# One shared VADER analyzer instance, created once and reused for every sentence.
analyzer = SentimentIntensityAnalyzer()
def get_setiment_score(sentence):
    """Return the ``(pos, neg, neu, compound)`` polarity scores of ``sentence``.

    The values are taken, in that order, from the dictionary produced by
    ``analyzer.polarity_scores``.
    """
    polarity = analyzer.polarity_scores(sentence)
    return tuple(polarity[key] for key in ('pos', 'neg', 'neu', 'compound'))

def extract_text_features(data_origin, data_transformed, random_state=0):
    """Derive length, reduced TF-IDF and sentiment features from the ``text`` column.

    Parameters
    ----------
    data_origin : pandas.DataFrame
        Raw data; only the ``text`` column is read.
    data_transformed : pandas.DataFrame
        Feature table, modified in place. Nothing is returned.
    random_state : int, optional
        Seed handed to ``TruncatedSVD`` so that repeated runs on the same
        data produce the same components.

    Columns written: ``text_length``, ``tf_idf_0`` ... ``tf_idf_<k-1>`` (one per
    SVD component) and ``sentiment_pos`` / ``sentiment_neg`` /
    ``sentiment_neu`` / ``sentiment_comp``.
    """
    print('extracting text features')

    # A missing tweet is treated as an empty string so that the vectorizer and
    # the sentiment analyzer receive a string for every row.
    text = data_origin['text'].fillna('')

    # text length: The length of the text
    data_transformed['text_length'] = text.str.len()

    # tf_idf: TF-IDF weights of the 100 most frequent non-stop-word terms
    # NOTE(review): the vectorizer and the SVD are fitted anew on every call,
    # so tf_idf_<i> columns produced by two separate calls are projections onto
    # different bases. Making them comparable needs shared fitted transformers.
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    # Keep the matrix sparse: TruncatedSVD accepts sparse input directly, so
    # densifying it with .toarray() only costs memory.
    tf_idf = vectorizer.fit_transform(text)

    # reduce the dimension of the tf_idf result using svd
    svd = TruncatedSVD(n_components=10, random_state=random_state)
    tf_idf_reducted = svd.fit_transform(tf_idf)
    print(svd.explained_variance_ratio_)
    for i in range(tf_idf_reducted.shape[1]):
        data_transformed['tf_idf_' + str(i)] = tf_idf_reducted[:, i]

    # sentiment_pos, sentiment_neg, sentiment_neu, sentiment_comp: scores for
    # positive, negative, neutral and compound sentiment of a tweet. The column
    # order matches the tuple returned by get_setiment_score. Going through a
    # DataFrame (rather than unpacking zip(*...)) also works for zero rows.
    sentiment_columns = ['sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'sentiment_comp']
    sentiment_scores = pd.DataFrame(
        text.apply(get_setiment_score).tolist(),
        index=text.index,
        columns=sentiment_columns,
    )
    for column in sentiment_columns:
        data_transformed[column] = sentiment_scores[column]


In [6]:
# Extract all features
features = ['user_verified','user_statuses_count','user_followers_count','user_friends_count','ratio_friends_followers',
           'mention_exist','mention_count', 'url_exist','url_count', 'hashtag_exist','hashtag_count',
           'timeseg', 'weekend', 'day_of_week',
           'text_length', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'sentiment_comp']
def extract_features(data_origin, is_train_data):
    """Build the complete feature table for ``data_origin``.

    Parameters
    ----------
    data_origin : pandas.DataFrame
        Raw data as loaded from the CSV file.
    is_train_data : bool
        When true, the ``retweet_count`` column is copied over as well.

    Returns
    -------
    pandas.DataFrame
        Table indexed like ``data_origin`` holding the columns listed in
        ``features`` plus whatever the individual extractors add, with missing
        values replaced by 0.
    """
    # Start from a zero-filled frame so that the column order follows `features`.
    data_transformed = pd.DataFrame(np.zeros((data_origin.shape[0], len(features))), index = data_origin.index, columns = features)
    if is_train_data:
        data_transformed['retweet_count'] = data_origin['retweet_count']
    extract_user_features(data_origin, data_transformed)
    extract_tweet_features(data_origin, data_transformed)
    extract_time_features(data_origin, data_transformed)
    extract_text_features(data_origin, data_transformed)
    # fillna returns a new frame; the result has to be used, otherwise the
    # missing values stay in the table that is handed back.
    return data_transformed.fillna(0)

## 2. Extraction of features

In [7]:
# Both datasets go through the same steps: load the raw CSV, extract the
# features, then write the transformed table next to the input file.
def _load_extract_save(source_csv, target_csv, extract_message, write_message, is_train_data):
    """Run the load -> extract -> save steps for one dataset.

    Returns the raw frame and the transformed frame as a tuple.
    """
    raw = pd.read_csv(source_csv, header=0, index_col=0)
    print(extract_message)
    transformed = extract_features(raw, is_train_data)
    print(write_message)
    transformed.to_csv(target_csv, index_label='id')
    return raw, transformed

# Extract features of evaluation data
X_evaluation, X_evaluation_transformed = _load_extract_save(
    "../data/evaluation.csv",
    '../data/evaluation_transformed.csv',
    'extracting features of evaluation data',
    'writing transformed evaluation data into evaluation_transformed.csv',
    False,
)

# Extract features of training data
X_train, X_train_transformed = _load_extract_save(
    "../data/train.csv",
    '../data/train_transformed.csv',
    'extracting features of trainning data',
    'writing transformed trainning data into train_transformed.csv',
    True,
)


extracting features of evaluation data
extracting user features
extracting tweet features
extracting time features
extracting text features
[0.07130282 0.05996944 0.04547115 0.02966212 0.02499426 0.02145453
 0.01976273 0.01828619 0.01796008 0.01559779]
writing transformed evaluation data into evaluation_transformed.csv
extracting features of trainning data
extracting user features
extracting tweet features
extracting time features
extracting text features
[0.07128467 0.06013182 0.04532428 0.02957303 0.0253767  0.02161874
 0.01971663 0.01820126 0.01771515 0.01531939]
writing transformed trainning data into train_transformed.csv
