# 2020 Önálló kutatási feladat

In [9]:
import pandas as pd
import re

## Stock data preprocessing

In [10]:
# Load the S&P 500 price history. Forward slashes are used in the relative path:
# the previous backslash form relied on invalid escape sequences ('\A', '\s', '\S')
# and only resolved on Windows.
stock = pd.read_csv('../Adatok/stock-data/SP500.csv')

# Only 'Date', 'Open' and 'Close' are needed downstream.
stock = stock.drop(columns=['High','Low','Adj Close','Volume'])
# Reduce timestamps to plain datetime.date objects so they can be matched against tweet dates.
stock['Date'] = pd.to_datetime(stock['Date']).dt.date

def Change(row):
    """Return the move of a single row: its 'Close' value minus its 'Open' value.

    ``row`` is anything indexable by the keys 'Open' and 'Close'
    (for example one row of the price table).
    """
    opening = row['Open']
    closing = row['Close']
    return closing - opening

# Close minus Open for every row, computed column-wise instead of with a row-by-row apply.
stock['Change'] = stock['Close'] - stock['Open']

def label(row, threshold):
    """Classify a row by the sign of its 'Change' value.

    Returns 1 when 'Change' is above ``threshold``, -1 when it is below
    ``-threshold``, and 0 when its absolute value does not exceed ``threshold``.
    """
    change = row['Change']
    # Written as "not >" on purpose: a NaN change fails the comparison and is labelled 0.
    if not abs(change) > threshold:
        return 0
    return 1 if change > 0 else -1


def generate_data(sensitivity):
    """Label every row of the global ``stock`` frame and return the non-neutral rows.

    ``sensitivity`` is forwarded to ``label`` as its threshold.

    Side effect: the 'Label' column of the module-level ``stock`` frame is
    created or overwritten on every call.

    Returns a new frame without the 'Open', 'Close' and 'Change' columns,
    containing only the rows whose label is not 0.
    """
    stock['Label'] = stock.apply(label, axis=1, args=(sensitivity,))
    labelled = stock.drop(columns=['Open', 'Close', 'Change'])
    is_directional = labelled['Label'] != 0
    return labelled[is_directional]

# Show the frame so the derived 'Change' column can be inspected.
stock

Unnamed: 0,Date,Open,Close,Change
0,2000-01-03,1469.250000,1455.219971,-14.030029
1,2000-01-04,1455.219971,1399.420044,-55.799927
2,2000-01-05,1399.420044,1402.109985,2.689941
3,2000-01-06,1402.109985,1403.449951,1.339966
4,2000-01-07,1403.449951,1441.469971,38.020020
...,...,...,...,...
5164,2020-07-14,3141.110107,3197.520020,56.409913
5165,2020-07-15,3225.979980,3226.560059,0.580079
5166,2020-07-16,3208.360107,3215.570068,7.209961
5167,2020-07-17,3224.209961,3224.729980,0.520019


### Itt tudjuk a *generate_data* függvény paraméterével állítani, hogy milyen mértékű változást veszünk figyelembe.

In [11]:
# Threshold of 2: rows whose |Close - Open| is at most 2 are labelled 0 and left out of the result.
stock_labeled = generate_data(sensitivity=2)
stock_labeled

Unnamed: 0,Date,Label
0,2000-01-03,-1
1,2000-01-04,-1
2,2000-01-05,1
4,2000-01-07,1
5,2000-01-10,1
...,...,...
5162,2020-07-10,1
5163,2020-07-13,-1
5164,2020-07-14,1
5166,2020-07-16,1


## Twitteres adatok preprocessing

In [12]:
#Obama
# Forward slashes keep the relative path portable; the previous backslash form
# relied on invalid escape sequences ('\A', '\o') and only resolved on Windows.
tweets_dem = pd.read_csv('../Adatok/obama/obama-tweets.csv')
tweets_dem = tweets_dem.drop(columns=['Username', 'Tweet Link', 'Retweets', 'Likes', 'TweetImageUrl', 'Image'])
# Parse the "YYYY/MM/DD_HH:MM" stamps, then keep only the calendar date.
tweets_dem['Date'] = pd.to_datetime(tweets_dem['Date'], format='%Y/%m/%d_%H:%M').dt.date
# The column now holds datetime.date objects, so the cut-off must be a date as well:
# ordering a date against a pandas Timestamp is deprecated and raises TypeError on pandas 2.x.
tweets_dem = tweets_dem[tweets_dem['Date'] < pd.to_datetime('2017-01-21').date()]
# Reverse the row order of the file, then renumber the rows from 0.
tweets_dem = tweets_dem.sort_index(ascending=False, axis=0).reset_index(drop=True)
tweets_dem.insert(2, "President", "Obama", allow_duplicates=False)

tweets_dem

Unnamed: 0,Date,Tweet-text,President
0,2012-11-05,"President Obama tells the story of ""Fired up! ...",Obama
1,2012-11-06,Election Day is here! Confirm your polling pla...,Obama
2,2012-11-06,It’s Election Day! This is your last chance to...,Obama
3,2012-11-06,At the final rally of his final campaign last ...,Obama
4,2012-11-06,25 reasons that 25 people are voting for Presi...,Obama
...,...,...,...
6673,2016-11-04,Let's keep working to keep our economy on a be...,Obama
6674,2016-11-05,In the weekly address President Obama discusse...,Obama
6675,2017-01-10,Tonight President Obama reflects on eight year...,Obama
6676,2017-01-20,Hi everybody! Back to the original handle. Is ...,Obama


In [13]:
#Trump
tweets_gop = pd.read_csv('../Adatok/trump/trump-tweets.csv')
tweets_gop = tweets_gop.drop(columns=['id', 'link', 'retweets', 'favorites', 'mentions', 'hashtags'])
# Keep only the calendar date of each tweet.
tweets_gop['date'] = pd.to_datetime(tweets_gop['date']).dt.date
tweets_gop = tweets_gop[['date', 'content']]
# The column holds datetime.date objects, so the cut-off must be a date as well:
# ordering a date against a pandas Timestamp is deprecated and raises TypeError on pandas 2.x.
tweets_gop = tweets_gop[tweets_gop['date'] >= pd.to_datetime('2017-01-21').date()]
tweets_gop = tweets_gop.reset_index(drop=True)
# Align the column names with the Obama frame so the two can be concatenated.
tweets_gop.columns = ["Date","Tweet-text"]
tweets_gop.insert(2, "President", "Trump", allow_duplicates=False)

tweets_gop

Unnamed: 0,Date,Tweet-text,President
0,2017-01-21,A fantastic day and evening in Washington D.C....,Trump
1,2017-01-22,Had a great meeting at CIA Headquarters yester...,Trump
2,2017-01-22,Watched protests yesterday but was under the i...,Trump
3,2017-01-22,"Wow, television ratings just out: 31 million p...",Trump
4,2017-01-22,Peaceful protests are a hallmark of our democr...,Trump
...,...,...,...
12086,2020-06-17,Joe Biden was a TOTAL FAILURE in Government. H...,Trump
12087,2020-06-17,Will be interviewed on @ seanhannity tonight a...,Trump
12088,2020-06-17,pic.twitter.com/3lm1spbU8X,Trump
12089,2020-06-17,pic.twitter.com/vpCE5MadUz,Trump


## Táblák joinolása

In [14]:
# Stack both tweet frames, then keep only the tweets whose date has a row in
# stock_labeled (inner join on 'Date'), attaching that row's label.
joined_data = (
    pd.concat([tweets_dem, tweets_gop], ignore_index=True)
    .merge(stock_labeled, on='Date', how='inner')
)

def handlehandler(row):
    """Replace Twitter @-mentions in the row's 'Tweet-text' with a single space.

    A mention is an '@', optionally followed by one space, followed by
    1 to 15 word characters.
    """
    # ' ?' makes the space after '@' optional, so both "@ handle" and "@handle" are removed.
    return re.sub(r'@ ?\w{1,15}', " ", row['Tweet-text'])

# The optional space in the pattern covers the Obama tweets too, which have no space between '@' and the handle.

def http(row):
    """Replace every http:// or https:// URL in the row's 'Tweet-text' with a single space."""
    url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    text = row['Tweet-text']
    return re.sub(url_pattern, " ", text)

def urls(row):
    """Replace scheme-less links in the row's 'Tweet-text' with a single space.

    A link here is a run of URL characters, a dot, 1 to 6 alphanumeric or
    parenthesis characters, and any trailing path/query characters.
    """
    bare_link_pattern = r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/=]*)'
    text = row['Tweet-text']
    return re.sub(bare_link_pattern, " ", text)

# Clean the tweet text in three passes: @-mentions, http(s) URLs, then scheme-less links.
joined_data['Tweet-text'] = joined_data.apply(handlehandler, axis=1)
joined_data['Tweet-text'] = joined_data.apply(http, axis=1)
joined_data['Tweet-text'] = joined_data.apply(urls, axis=1)

joined_data = joined_data.rename(columns={'Tweet-text': 'text', 'Date': 'date', 'Label': 'label', 'President': 'pres'})

# Drop tweets with no content left after cleaning. The text is stripped first so that
# tweets reduced to several spaces (e.g. two removed links) are dropped as well,
# not only those that became exactly one space.
joined_data = joined_data[joined_data['text'].str.strip() != ""]

joined_data.tail()

Unnamed: 0,date,text,pres,label
12353,2020-06-16,True! …,Trump,-1
12354,2020-06-16,A GREAT woman. Her son is looking down from he...,Trump,-1
12355,2020-06-16,96% Approval Rating in the Republican Party. T...,Trump,-1
12356,2020-06-17,Joe Biden was a TOTAL FAILURE in Government. H...,Trump,-1
12357,2020-06-17,Will be interviewed on tonight at 9:00 Enjoy!,Trump,-1
