## Pre-Processing Script - Sentiment Analyzer

* Script used to process tweets and calculate project-specific variables.
    * Project-specific variables are currently commented out
    * Sentiment analysis and recoding are currently included

**Step-by-Step Instructions**
1. Set `path` in the first code cell to the folder containing the tweet CSV files you'd like to process
2. Run the script - the output file will be named "ProcessedTweets.csv"

## Importing Tweets

In [1]:
import os
import glob
import pandas as pd
import numpy as np

# Folder holding the saved tweet CSV files; adjust to point at your own data.
path = r'./SavedTweets/'
# Sort the matches so the files are concatenated in a reproducible order
# (glob returns them in whatever order the file system yields).
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail early with a clear message instead of pd.concat's
# "No objects to concatenate" when the folder is empty or the path is wrong.
if not all_files:
    raise FileNotFoundError("No .csv files found in %r" % path)

# Read lazily and stack everything into one frame with a fresh 0..n-1 index.
df_all_files = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_all_files, ignore_index=True)

## Pre-Processing

In [2]:
# cleaning hashtags
# Removes bracket/brace/quote/colon/comma punctuation, "[<digits>, <digits>" index
# pairs and the words 'text' / 'indices' from the Hashtags column.
# (Presumably the column holds stringified lists of {'text': ..., 'indices': [...]}
# dicts -- TODO confirm against the saved CSVs.)
# NOTE(review): 'text' and 'indices' are also stripped when they occur inside a
# hashtag word itself -- confirm this is acceptable.
# Raw string: the pattern contains regex escapes (\[, \d, ...) that are not valid
# Python string escapes.
regex = r"(\[\d+\, \d+)|text|indices|\'|\]|\[|\:|}|text|\,|{"
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the column uncleaned.
df['Hashtags'] = (
    df.Hashtags
    .str.replace(regex, '', regex=True)
    # Runs of four spaces are what remains between entries; turn them into ', '.
    .str.replace('    ', ', ', regex=False)
)

In [3]:
# coding date/time
# Parse the raw date column into datetimes. The infer_datetime_format argument is
# dropped because it is deprecated in pandas 2.0, where format inference is the default.
df['date'] = pd.to_datetime(df.date)
# Calendar date only (no time component).
df['r_date'] = df.date.dt.date
# Series.dt.weekday_name was removed in pandas 1.0; day_name() is its replacement
# and yields the same English day names ('Monday', 'Tuesday', ...).
df['weekday'] = df.date.dt.day_name()
# Hour of day taken from the timestamp.
df['hour'] = df.date.dt.hour

In [4]:
# Disabled project-specific coding: everything below is wrapped in a single string
# literal, so none of it executes (the cell only echoes the string). Remove the
# surrounding triple quotes to enable it.
# When enabled, it labels each tweet with a 'week' value based on its date and then
# fills 'subject' with the matching topic name from subject_val.
# NOTE(review): both bounds of every date range are strict (> and <), so a timestamp
# exactly equal to a boundary date would receive no week -- confirm that is intended.
'''# coding week of class
df.loc[((df['date'] > '2018-8-28') & (df['date'] < '2018-9-4')), 'week'] = 'Week 1'
df.loc[((df['date'] > '2018-9-4') & (df['date'] < '2018-9-11')), 'week'] = 'Week 2'
df.loc[((df['date'] > '2018-9-11') & (df['date'] < '2018-9-18')),'week'] = 'Week 3'
df.loc[((df['date'] > '2018-9-18') & (df['date'] < '2018-9-25')), 'week'] = 'Week 4'
df.loc[((df['date'] > '2018-9-25') & (df['date'] < '2018-10-2')), 'week'] = 'Week 5'
df.loc[((df['date'] > '2018-10-2') & (df['date'] < '2018-10-9')), 'week'] = 'Week 6'
df.loc[((df['date'] > '2018-10-9') & (df['date'] < '2018-10-16')),'week'] = 'Week 7'
df.loc[((df['date'] > '2018-10-16') & (df['date'] < '2018-10-23')), 'week'] = 'Week 8'
df.loc[((df['date'] > '2018-10-23') & (df['date'] < '2018-10-30')), 'week'] = 'Week 9'
df.loc[((df['date'] > '2018-10-30') & (df['date'] < '2018-11-6')), 'week'] = 'Week 10'
df.loc[((df['date'] > '2018-11-6') & (df['date'] < '2018-11-13')), 'week'] = 'Week 11'
df.loc[((df['date'] > '2018-11-13') & (df['date'] < '2018-11-20')), 'week'] = 'Week 12'
df.loc[((df['date'] > '2018-11-20') & (df['date'] < '2018-11-27')), 'week'] = 'Week 13'
df.loc[((df['date'] > '2018-11-27') & (df['date'] < '2018-12-4')), 'week'] = 'Week 14'
df.loc[((df['date'] > '2018-12-4') & (df['date'] < '2018-12-13')), 'week'] = 'Week 15'
df.loc[((df['date'] > '2018-12-13') & (df['date'] < '2018-12-20')), 'week'] = 'Week 16'

# week - subject
subject_val = {'Week 1':'Introduction-SocialMedia+Journalism','Week 2':'Social Media Best Practices/Electronic Communication Theories/Social Media Ethics','Week 3':'SEO/Analytics and Hashtags','Week 4':'Light Side/DarkSide','Week 5':'Fake News/Business Advertising','Week 6':'Prepare Case Studies','Week 7':'GuestLecture/Case Study Homework','Week 8':'Case Study Presentation/Rise of Influencers','Week 9':'Nonprofits/Social Movements','Week 10':'Social Media and Politics','Week 11':'Multimedia Storytelling','Week 12':'Personal Privacy on Social Media','Week 13':'Thanksgiving Break','Week 14':'Careers in Social Media','Week 15':'Social Media Campaign Presentatation','Week 16':'Final Exam'}
df['subject'] = df.week
df.subject.replace(subject_val, inplace=True)'''

"# coding week of class\ndf.loc[((df['date'] > '2018-8-28') & (df['date'] < '2018-9-4')), 'week'] = 'Week 1'\ndf.loc[((df['date'] > '2018-9-4') & (df['date'] < '2018-9-11')), 'week'] = 'Week 2'\ndf.loc[((df['date'] > '2018-9-11') & (df['date'] < '2018-9-18')),'week'] = 'Week 3'\ndf.loc[((df['date'] > '2018-9-18') & (df['date'] < '2018-9-25')), 'week'] = 'Week 4'\ndf.loc[((df['date'] > '2018-9-25') & (df['date'] < '2018-10-2')), 'week'] = 'Week 5'\ndf.loc[((df['date'] > '2018-10-2') & (df['date'] < '2018-10-9')), 'week'] = 'Week 6'\ndf.loc[((df['date'] > '2018-10-9') & (df['date'] < '2018-10-16')),'week'] = 'Week 7'\ndf.loc[((df['date'] > '2018-10-16') & (df['date'] < '2018-10-23')), 'week'] = 'Week 8'\ndf.loc[((df['date'] > '2018-10-23') & (df['date'] < '2018-10-30')), 'week'] = 'Week 9'\ndf.loc[((df['date'] > '2018-10-30') & (df['date'] < '2018-11-6')), 'week'] = 'Week 10'\ndf.loc[((df['date'] > '2018-11-6') & (df['date'] < '2018-11-13')), 'week'] = 'Week 11'\ndf.loc[((df['date'] > '2

## Sentiment Analysis

In [5]:
# Vader Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Work on a copy of the user/text columns; missing values are replaced by the
# string '0' (as in the original analysis) so every row gets scored.
# fillna is not done in place, to avoid mutating a slice of df.
data_OE = df.loc[:, ['user', 'text']].fillna('0')

# Every column except 'user' is scored (currently just 'text').
text_columns = [name for name in data_OE.columns if name != 'user']

# Output column names, four per scored column, in the order
# compound, positive, negative, neutral (e.g. text_cmp, pos_text, neg_text, neu_text).
columns = []
for name in text_columns:
    columns.extend([name + '_cmp', 'pos_%s' % name, 'neg_%s' % name, 'neu_%s' % name])

# Run VADER once per text and reuse the returned dict for all four scores
# (the original called polarity_scores four times per tweet).
score_data = {}
for name in text_columns:
    scores = [analyzer.polarity_scores(text) for text in data_OE[name]]
    score_data[name + '_cmp'] = [s["compound"] for s in scores]
    score_data['pos_%s' % name] = [s["pos"] for s in scores]
    score_data['neg_%s' % name] = [s["neg"] for s in scores]
    score_data['neu_%s' % name] = [s["neu"] for s in scores]

# Build the score frame in one step, aligned to df's index, instead of assigning
# full-length lists into a zero-row frame. The name df_empty is kept from the
# original cell in case other cells refer to it.
df_empty = pd.DataFrame(score_data, columns=columns, index=data_OE.index)

# merges sentiment results into dataframe
# Drop score columns left over from a previous run of this cell first, so re-running
# it does not produce duplicated *_x / *_y columns.
df = df.drop(columns=columns, errors='ignore')
df = pd.merge(df, df_empty, left_index=True, right_index=True)

## Regrouping Variables

In [7]:
# Mean compound score per user, broadcast back onto each of that user's rows.
user_group = df.groupby('user')
df['Sentiment_Mean'] = user_group.text_cmp.transform('mean')

# Disabled project-specific variables: the string literal below is never executed.
'''week_group = df.groupby(['user', 'week'])
df['Week_sent_Mean'] = week_group.text_cmp.transform('mean')

# tweet counting
df['Tweet_Number'] = df.groupby(['user']).cumcount()
df['Tweet_Total'] = df.groupby('user').week.transform('count')

df.loc[(df.Tweet_Total > 18), 'Post_Frequency'] = 'Frequent'
df.loc[(df.Tweet_Total >= 14) & (df.Tweet_Total <= 18), 'Post_Frequency'] = 'Normal'
df.loc[(df.Tweet_Total <= 13) & (df.Tweet_Total >= 6), 'Post_Frequency'] = 'Infrequent'
df.loc[(df.Tweet_Total < 6), 'Post_Frequency'] = 'Very Infrequent'
'''

# Tweet Sentiment
# Bucket each tweet by its compound score using +/-0.05 cut-offs.
df.loc[(df.text_cmp >= .05), 'Tweet_Bucket'] = 'Positive'
df.loc[(df.text_cmp < .05) & (df.text_cmp > -.05), 'Tweet_Bucket'] = 'Neutral'
df.loc[(df.text_cmp <= -.05), 'Tweet_Bucket'] = 'Negative'


# Disabled numeric week recode: string literal, never executed.
'''subject_val = {'Week 1':1,'Week 2':2,'Week 3':3,'Week 4':4,
               'Week 5':5,'Week 6':6,'Week 7':7,'Week 8':8,
               'Week 9':9,'Week 10':10,'Week 11':11, 'Week 12':12,
               'Week 13':13,'Week 14':14,'Week 15':15,'Week 16':16}

df['week_num'] = df.week
df['week_num'].replace(subject_val, inplace=True)
'''

# User Sentiment
# Same +/-0.05 cut-offs, applied to the per-user mean.
df.loc[(df.Sentiment_Mean >= .05), 'UserSent_Bucket'] = 'Positive'
df.loc[(df.Sentiment_Mean < .05) & (df.Sentiment_Mean > -.05), 'UserSent_Bucket'] = 'Neutral'
df.loc[(df.Sentiment_Mean <= -.05), 'UserSent_Bucket'] = 'Negative'
# Rows whose mean is missing match none of the conditions above; label them Neutral.
# Assign the result back instead of calling fillna(inplace=True) on the attribute:
# the chained in-place form warns on pandas 2.x and has no effect under Copy-on-Write.
df['UserSent_Bucket'] = df['UserSent_Bucket'].fillna(value='Neutral')

## Exporting

In [8]:
# Write the processed tweets to disk, leaving the DataFrame index out of the file.
df.to_csv(path_or_buf='ProcessedTweets.csv', index=False)