In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

# Loading and merging the data

In [2]:
file1 = pd.read_csv("../data/crowdsourced.csv", encoding='utf-8')
file2 = pd.read_csv("../data/groundtruth.csv", encoding='utf-8')
df = pd.concat([file1, file2])


df["date"] = df["File_id"].str.strip(to_strip=".txt")

df["date"] = pd.to_datetime(df["date"])
df.sort_values("date", inplace= True)
df["mos_before_election"] = 11 - df["date"].dt.month

df['index'] = pd.RangeIndex(len(df))
df.set_index('index', inplace=True)
df


Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.417840,-1,1960-09-26,2
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.000000,-1,1960-09-26,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23528,34028,"First of all, the media is so dishonest and so...",Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,17,907,0.032300,-1,2016-10-19,1
23529,34027,What I've seen -- what I've seen is so bad.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,9,906,-0.669600,-1,2016-10-19,1
23530,34026,I'll look at it at the time.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,905,0.000000,-1,2016-10-19,1
23531,34039,So I talk about the corrupt media.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,918,0.000000,-1,2016-10-19,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23533 entries, 0 to 23532
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Sentence_id          23533 non-null  int64         
 1   Text                 23533 non-null  object        
 2   Speaker              23533 non-null  object        
 3   Speaker_title        23533 non-null  object        
 4   Speaker_party        23533 non-null  object        
 5   File_id              23533 non-null  object        
 6   Length               23533 non-null  int64         
 7   Line_number          23533 non-null  int64         
 8   Sentiment            23530 non-null  float64       
 9   Verdict              23533 non-null  int64         
 10  date                 23533 non-null  datetime64[ns]
 11  mos_before_election  23533 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 2.3+ MB


# Data preprocessing

In [4]:
def remove_punctuation(text):
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower()
    return tokens

In [5]:
def remove_stop_words(text):
    stop_words = stopwords.words('english')
    word_list = [word for word in text.split() if word not in stop_words]
    return word_list

In [6]:
def get_word_stemm(word_list):
    """Stemmers remove morphological affixes from words, leaving only the word stem."""
    stemmer = SnowballStemmer('english')
    singles = [stemmer.stem(word) for word in word_list] 
    return singles

In [7]:
def preprocess_data(docs, stemming=False):

    text_list = [] 
    for doc in docs:  
        # 1. Remove punctuation and set as lower case
        text = remove_punctuation(doc)

        # 2. Remove stop words and extra spaces
        word_list = remove_stop_words(text)

        if stemming:
            # 3. Stemming
            word_list = get_word_stemm(word_list)

        joined_text = " ".join(word_list)
        text_list.append(joined_text)
        
    return text_list


In [8]:
data = preprocess_data(df.Text.values)

In [9]:
data

['standing still',
 'three programs quite moderate',
 'proposal advanced mr javits would cost six hundred millions dollars mr rockefeller rejected new york said agree financing said ought social security',
 'put deficit treasury',
 'third medical care aged tied social security financed social security funds',
 'say majority republicans opposed',
 'say majority',
 'say democrats united support program',
 'extreme bill yet could get one republican join least think four eight democrats voted send floor house one republican joined democrats opposed',
 'fact matter bill less recommended mr nixon morning proposal',
 'uh defeat teacher salaries bill uh met opinion need',
 'secondly federal aid education bill',
 'think extreme yet nearly two thirds three fourths republicans house representatives voted proposal',
 'one bill dollar twenty five cents hour anyone works store company million dollars year business',
 'well let look bills vice president suggests extreme',
 'reason president eisenhowe

In [10]:
# add clean text to dataframe
df["Clean_text"] = pd.Series(data)

In [11]:
df.head()

Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election,Clean_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.41784,-1,1960-09-26,2,standing still
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2,three programs quite moderate
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2,proposal advanced mr javits would cost six hun...
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2,put deficit treasury
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.0,-1,1960-09-26,2,third medical care aged tied social security f...


In [12]:
df_new = df.loc[:, ['date', 'Text', 'Clean_text', 'Verdict']]


In [13]:
df_new.to_csv('data.csv',  index=False)

## With stemming 

In [14]:
data1 = preprocess_data(df.Text.values, stemming=True)

In [15]:
# add clean text to dataframe
df.drop(['Clean_text'], axis=1, inplace=True)
df["Clean_text"] = pd.Series(data1)
df_new = df.loc[:, ['date', 'Text', 'Clean_text', 'Verdict']]
df_new.to_csv('data_stemmed.csv')