# Delta Practice

### File Read In

In [148]:
import pandas as pd
from datetime import timedelta

filepath1 = r'/Users/cartersocha/Downloads/instgramHashtagCounts.xlsx'
instaHashtagDf = pd.read_excel(filepath1)

filepath2 = r'/Users/cartersocha/Downloads/tweetCountTest.xlsx'
dailyTweetDf = pd.read_excel(filepath2)

filepath3 = r'/Users/cartersocha/Desktop/ReleaseData.xlsx'
releaseDf = pd.read_excel(filepath3, "ShowInfoEndStart")

filepath4 = r'/Users/cartersocha/Downloads/instgramAccountCounts.xlsx'
igAccountDf = pd.read_excel(filepath4)

filepath5 = r'/Users/cartersocha/Downloads/redditCountTest.xlsx'
redditSubsDf = pd.read_excel(filepath5)

filepath6 = r'/Users/cartersocha/Downloads/redditCommentCombo.csv'
redditCommentsDf = pd.read_csv(filepath6)

filepath8 = r'/Users/cartersocha/Desktop/ReleaseData.xlsx'
releaseDateDf = pd.read_excel(filepath8, "ReleaseDateData")

filepath10 = r'/Users/cartersocha/Downloads/googleDataset2.csv'
googleDailyData = pd.read_csv(filepath10)

### Functions

In [149]:
def DateTimeConvert(dateDf, dateColumn):
    dateDf[dateColumn] = pd.to_datetime(dateDf[dateColumn])  

    return dateDf

In [150]:
def DiffMaker(fillnaDf, valueColumn, dateColumn):
    fillnaDf.sort_values(['TvShow', dateColumn], inplace=True)

    fillnaDf['diffs'] = fillnaDf.groupby(['TvShow'])[valueColumn].transform(lambda x: x.diff()).fillna(0)

    fillnaDf.sort_index(inplace=True)

    return fillnaDf

In [151]:
def RemoveData(release,showDf):

    bigDf = pd.DataFrame()

    for i in range(len(release)):
        show = release['TvShow'][i]
        firstDate = release['Release Date'][i]
        secondDate = release['90DayDate'][i]

        smallDf = showDf[showDf['TvShow'] == show]

        newdf = smallDf[smallDf['RunDate'].between(firstDate, secondDate)]

        bigDf = bigDf.append(newdf,ignore_index=True)

    return bigDf

In [152]:
def MergeDfs(mainDf, secondDf, columnName):

    merged = pd.merge(mainDf,secondDf, how='outer', on=columnName)
    return merged

In [153]:
import numpy as np

def NegativeDiffs(diffDf,columnName):

    diffDf['zeroedDiffs'] = np.where((diffDf[columnName] < 0), 0, diffDf[columnName])

    return diffDf

In [154]:
def SumSocialMedia(sumDf,columnName,trigger):
    # check to see if this is episode count or generic summarization
    if trigger == 1:
        summarizedDf = sumDf.groupby(columnName, as_index=False).sum()
        summarizedDf = pd.DataFrame(summarizedDf)
    
    else:
        summarizedDf = sumDf.groupby(columnName, as_index=False).count()
        summarizedDf = pd.DataFrame(summarizedDf)
    
    return summarizedDf

In [155]:
from sklearn.preprocessing import MinMaxScaler

def NormalizeData(normalDf,columnName):

    # define min max scaler
    scaler = MinMaxScaler()
    # transform data
    scaledSeries = scaler.fit_transform(normalDf[[columnName]])
    
    return scaledSeries

### Data Transformations

##### Release Data

In [156]:
releaseDf['90DayDate'] = releaseDf['Release Date'] + pd.DateOffset(days=90)

releaseDf = DateTimeConvert(releaseDf,'90DayDate')
releaseDf = DateTimeConvert(releaseDf,'Release Date')

##### Google Data

In [157]:
googleDailyData = DateTimeConvert(googleDailyData,'RunDate')
googDf1 = RemoveData(releaseDf, googleDailyData)
googDf1 = NegativeDiffs(googDf1, 'GoogleValue')

googDf1['GoogleValue'] = googDf1['zeroedDiffs']

googDf1['ScaledDataAdjust'] = NormalizeData(googDf1,'GoogleValue')

googDf1['ScaledGoogleValue'] = googDf1['ScaledDataAdjust']

googDf1 = googDf1[['TvShow','RunDate','GoogleValue','ScaledGoogleValue']]

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Instagram Hashtag Data

In [158]:
instaHashtagDf = DateTimeConvert(instaHashtagDf,'RunDate')

instaHashtagDf = DiffMaker(instaHashtagDf,'HashtagValue','RunDate')

instaHashDf = RemoveData(releaseDf, instaHashtagDf)
instaHashDf = NegativeDiffs(instaHashDf, 'diffs')
instaHashDf['SocialMediaSource'] = 'InstagramHashtag'
instaHashDf['SocialMediaValue'] = instaHashDf['zeroedDiffs']

instaHashDf = MergeDfs(instaHashDf, googDf1,['TvShow','RunDate'])
instaHashDf = instaHashDf.fillna(0)
instaHashDf = instaHashDf[instaHashDf['SocialMediaSource'] != 0]
instaHashDf['GoogleAdjustedSocial'] = (instaHashDf['SocialMediaValue'] * instaHashDf['ScaledGoogleValue']) + instaHashDf['SocialMediaValue']

instaHashDf['ScaledDataAdjustGoogle'] = NormalizeData(instaHashDf,'GoogleAdjustedSocial')
instaHashDf['ScaledDataAdjust'] = NormalizeData(instaHashDf,'SocialMediaValue')

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Twitter Data

In [159]:
dailyTweetDf = DateTimeConvert(dailyTweetDf,'RunDate')
dailyTweetDf = RemoveData(releaseDf, dailyTweetDf)
dailyTweetDf['SocialMediaValue'] = dailyTweetDf['TweetCount'] + dailyTweetDf['RetweetCount']
dailyTweetDf['SocialMediaSource'] = 'Tweets'

dailyTweetDf = MergeDfs(dailyTweetDf, googDf1,['TvShow','RunDate'])
dailyTweetDf = dailyTweetDf.fillna(0)
dailyTweetDf = dailyTweetDf[dailyTweetDf['SocialMediaSource'] != 0]
dailyTweetDf['GoogleAdjustedSocial'] = (dailyTweetDf['SocialMediaValue'] * dailyTweetDf['ScaledGoogleValue']) + dailyTweetDf['SocialMediaValue']

dailyTweetDf['ScaledDataAdjustGoogle'] = NormalizeData(dailyTweetDf,'GoogleAdjustedSocial')
dailyTweetDf['ScaledDataAdjust'] = NormalizeData(dailyTweetDf,'SocialMediaValue')

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Reddit Comment Data

In [160]:
redditCommentsDf = DateTimeConvert(redditCommentsDf,'RunDate')
redCommentDf = RemoveData(releaseDf, redditCommentsDf)
redCommentDf['SocialMediaValue'] = redCommentDf['NumComments'] + redCommentDf['score']

redCommentDf = redCommentDf[['TvShow','RunDate','SocialMediaValue']]
iRedCommentDf = SumSocialMedia(redCommentDf, ['TvShow','RunDate'],1)
iRedCommentDf['SocialMediaSource'] = 'RedditComments'

iRedCommentDf = MergeDfs(iRedCommentDf, googDf1,['TvShow','RunDate'])
iRedCommentDf = iRedCommentDf.fillna(0)
iRedCommentDf = iRedCommentDf[iRedCommentDf['SocialMediaSource'] != 0]
iRedCommentDf['GoogleAdjustedSocial'] = (iRedCommentDf['SocialMediaValue'] * iRedCommentDf['ScaledGoogleValue']) + iRedCommentDf['SocialMediaValue']

iRedCommentDf['ScaledDataAdjustGoogle'] = NormalizeData(iRedCommentDf,'GoogleAdjustedSocial')
iRedCommentDf['ScaledDataAdjust'] = NormalizeData(iRedCommentDf,'SocialMediaValue')

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Instagram Account Data

In [161]:
igAccountDf = DateTimeConvert(igAccountDf,'RunDate')

igAccountDf = DiffMaker(igAccountDf,'IgAccountCounts','RunDate')


instaAccDf = RemoveData(releaseDf, igAccountDf)
instaAccDf = NegativeDiffs(instaAccDf, 'diffs')
instaAccDf['SocialMediaSource'] = 'InstagramAccount'
instaAccDf['SocialMediaValue'] = instaAccDf['zeroedDiffs']

instaAccDf = MergeDfs(instaAccDf, googDf1,['TvShow','RunDate'])
instaAccDf = instaAccDf.fillna(0)
instaAccDf = instaAccDf[instaAccDf['SocialMediaSource'] != 0]
instaAccDf['GoogleAdjustedSocial'] = (instaAccDf['SocialMediaValue'] * instaAccDf['ScaledGoogleValue']) + instaAccDf['SocialMediaValue']

instaAccDf['ScaledDataAdjustGoogle'] = NormalizeData(instaAccDf,'GoogleAdjustedSocial')
instaAccDf['ScaledDataAdjust'] = NormalizeData(instaAccDf,'SocialMediaValue')

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Reddit Subscriber Data

In [162]:
redditSubsDf = DateTimeConvert(redditSubsDf,'RunDate')

redditSubsDf = DiffMaker(redditSubsDf,'RedditSubs','RunDate')

redSubDf = RemoveData(releaseDf, redditSubsDf)
redSubDf = NegativeDiffs(redSubDf, 'diffs')
redSubDf['SocialMediaSource'] = 'RedditSubscribers'
redSubDf['SocialMediaValue'] = redSubDf['zeroedDiffs']

redSubDf = MergeDfs(redSubDf, googDf1,['TvShow','RunDate'])
redSubDf = redSubDf.fillna(0)
redSubDf = redSubDf[redSubDf['SocialMediaSource'] != 0]
redSubDf['GoogleAdjustedSocial'] = (redSubDf['SocialMediaValue'] * redSubDf['ScaledGoogleValue']) + redSubDf['SocialMediaValue']

redSubDf['ScaledDataAdjustGoogle'] = NormalizeData(redSubDf,'GoogleAdjustedSocial')
redSubDf['ScaledDataAdjust'] = NormalizeData(redSubDf,'SocialMediaValue')

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Release Date Data

In [163]:
releaseDateDf = DateTimeConvert(releaseDateDf,'EpisodeReleaseDate')
iReleaseData = SumSocialMedia(releaseDateDf, ['TvShow','EpisodeReleaseDate'],0)
iReleaseData['DailyReleaseCount'] = iReleaseData['EpisodeNumber']
iReleaseData['RunDate'] = iReleaseData['EpisodeReleaseDate']

##### Join the Data

##### Time Value Datasets

In [164]:
instaAccDf = instaAccDf[['TvShow','RunDate','SocialMediaValue','SocialMediaSource','ScaledDataAdjust','GoogleAdjustedSocial','ScaledDataAdjustGoogle']]#,'diffs']]
redSubDf = redSubDf[['TvShow','RunDate','SocialMediaValue','SocialMediaSource','ScaledDataAdjust','GoogleAdjustedSocial','ScaledDataAdjustGoogle']]#,'diffs']]
dailyTweetDf = dailyTweetDf[['TvShow','RunDate','SocialMediaValue','SocialMediaSource','ScaledDataAdjust','GoogleAdjustedSocial','ScaledDataAdjustGoogle']]
instaHashDf = instaHashDf[['TvShow','RunDate','SocialMediaValue','SocialMediaSource','ScaledDataAdjust','GoogleAdjustedSocial','ScaledDataAdjustGoogle']]#,'diffs']]
# reddit comment data is combined elsewhere
iReleaseData = iReleaseData[['TvShow','RunDate','DailyReleaseCount']]

In [165]:
uberDf2 = pd.concat([instaAccDf,redSubDf,dailyTweetDf,instaHashDf,iRedCommentDf])#,googDf1])

uberDf3 = MergeDfs(uberDf2, releaseDf,'TvShow')
uberDf3

uberDf3 = MergeDfs(uberDf3, iReleaseData,['TvShow','RunDate'])
uberDf3 = uberDf3[uberDf3['Ignore'] == 1]
uberDf3 = uberDf3.fillna(0)
uberDf3['PostReleaseDay'] = (uberDf3['RunDate'] - uberDf3['Release Date']).astype(str).str.replace(' days','').astype(int)

### Output - individuals & uber dataset

In [166]:
uberDf3 = uberDf3.sort_values(['TvShow','RunDate','SocialMediaSource'])

In [167]:
uberdf4= uberDf3[uberDf3['PostReleaseDay'] <= 90]

In [168]:
#uberdf5 = MergeDfs(uberdf4, googDf1,['TvShow','RunDate'])
uberdf5= uberdf4[uberdf4['PostReleaseDay'] <= 90].fillna(0)

filepath2 = r'/Users/cartersocha/Downloads/uberDataset3.csv'

uberdf5.to_csv(filepath2) 