# Delta Practice

### File Read In

In [44]:
import pandas as pd

# --- Input locations -------------------------------------------------------
# Absolute, machine-specific paths; adjust before running on another machine.
filepath1 = r'/Users/cartersocha/Downloads/instgramHashtagCounts.xlsx'
filepath2 = r'/Users/cartersocha/Downloads/tweetCountTest.xlsx'
filepath3 = r'/Users/cartersocha/Desktop/ReleaseData.xlsx'
filepath4 = r'/Users/cartersocha/Downloads/instgramAccountCounts.xlsx'
filepath5 = r'/Users/cartersocha/Downloads/redditCountTest.xlsx'
filepath6 = r'/Users/cartersocha/Downloads/redditCountFinal.txt'
filepath7 = r'/Users/cartersocha/Downloads/googleTvCount.xlsx'
# Same workbook as filepath3; a different sheet is read from it below.
filepath8 = r'/Users/cartersocha/Desktop/ReleaseData.xlsx'

# --- Load each source into its own frame -----------------------------------
instaHashtagDf = pd.read_excel(io=filepath1)
dailyTweetDf = pd.read_excel(io=filepath2)
releaseDf = pd.read_excel(io=filepath3, sheet_name="ShowInfoEndStart")
igAccountDf = pd.read_excel(io=filepath4)
redditSubsDf = pd.read_excel(io=filepath5)
# Reddit comments are not loaded; the read is left disabled.
#redditCommentsDf = pd.read_excel(filepath6)
googleTrendsDf = pd.read_excel(io=filepath7)
releaseDateDf = pd.read_excel(io=filepath8, sheet_name="ReleaseDateData")

### Functions

In [45]:
def DateTimeConvert(dateDf, dateColumn):
    """Parse one column of ``dateDf`` as datetimes.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.

    Parameters:
        dateDf: DataFrame holding the column to convert.
        dateColumn: label of the column to run through ``pd.to_datetime``.

    Returns:
        ``dateDf`` itself, with ``dateColumn`` replaced by its parsed values.
    """
    parsedDates = pd.to_datetime(dateDf[dateColumn])
    dateDf[dateColumn] = parsedDates
    return dateDf

In [46]:
def DiffMaker(fillnaDf, valueColumn, dateColumn):
    """Add a ``diffs`` column holding the per-show change in ``valueColumn``.

    Rows are ordered by ``TvShow`` and ``dateColumn``, each row is differenced
    against the previous row of the same show, and the first row of every show
    (which has no predecessor) gets 0. The frame is then put back into index
    order. All of this happens in place on the frame passed in.

    Parameters:
        fillnaDf: DataFrame with a ``TvShow`` column plus the two columns below.
        valueColumn: label of the column to difference.
        dateColumn: label of the column that defines row order within a show.

    Returns:
        ``fillnaDf`` itself, with the extra ``diffs`` column.
    """
    orderKeys = ['TvShow', dateColumn]
    fillnaDf.sort_values(by=orderKeys, inplace=True)

    changePerShow = fillnaDf.groupby('TvShow')[valueColumn].diff()
    fillnaDf['diffs'] = changePerShow.fillna(0)

    # restore the row order the caller had (by index label)
    fillnaDf.sort_index(inplace=True)
    return fillnaDf

In [47]:
def RemoveData(release,showDf):
    """Keep only the rows of ``showDf`` that fall inside each show's release window.

    For every row of ``release`` the rows of ``showDf`` with the same
    ``TvShow`` and a ``RunDate`` between ``Release Date`` and ``90DayDate``
    (both ends inclusive) are selected. The selections are stacked in the
    order the shows appear in ``release``.

    Parameters:
        release: DataFrame with ``TvShow``, ``Release Date`` and ``90DayDate``
            columns; one row per window.
        showDf: DataFrame with ``TvShow`` and ``RunDate`` columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When ``release`` has no
        rows, an empty frame with ``showDf``'s columns is returned.
    """
    windowFrames = []

    # Iterate the columns directly instead of looking rows up by integer
    # label, so a filtered or re-indexed ``release`` frame works too.
    windows = zip(release['TvShow'], release['Release Date'], release['90DayDate'])
    for show, firstDate, secondDate in windows:
        inWindow = (showDf['TvShow'] == show) & showDf['RunDate'].between(firstDate, secondDate)
        windowFrames.append(showDf[inWindow])

    # pd.concat refuses an empty list, so handle "no windows" explicitly
    if not windowFrames:
        return showDf.iloc[0:0].copy()

    # One concat at the end replaces the removed DataFrame.append and avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(windowFrames, ignore_index=True)

In [48]:
def MergeDfs(mainDf, secondDf, columnName):
    """Outer-join two frames on the given key column(s).

    Parameters:
        mainDf: left-hand DataFrame.
        secondDf: right-hand DataFrame.
        columnName: column label, or list of labels, present in both frames.

    Returns:
        The outer join of the two frames; rows without a partner on the other
        side are kept.
    """
    return mainDf.merge(secondDf, how='outer', on=columnName)

In [49]:
import numpy as np

def NegativeDiffs(diffDf,columnName):
    """Add a ``zeroedDiffs`` column: ``columnName`` with negative values set to 0.

    Values that are not below zero (including missing values) are copied
    through unchanged. The column is added to the frame passed in.

    Parameters:
        diffDf: DataFrame holding ``columnName``.
        columnName: label of the numeric column to clamp.

    Returns:
        ``diffDf`` itself, with the extra ``zeroedDiffs`` column.
    """
    sourceValues = diffDf[columnName]
    isNegative = sourceValues < 0
    diffDf['zeroedDiffs'] = np.where(isNegative, 0, sourceValues)
    return diffDf

In [50]:
def SumSocialMedia(sumDf,columnName):
    """Aggregate ``sumDf`` per group, summing or counting depending on the key.

    * ``columnName`` is a single label: generic summarization. Numeric columns
      are summed per group; non-numeric columns (dates, text) are left out of
      the result.
    * ``columnName`` is a list of labels: episode count. Every remaining
      column holds the number of non-null rows per group.

    Previously the sum was computed and then unconditionally overwritten by
    the count, so callers always received counts.

    Parameters:
        sumDf: DataFrame to aggregate.
        columnName: a column label, or a list of column labels, to group by.

    Returns:
        A new DataFrame with the grouping key(s) as ordinary columns.
    """
    # check to see if this is episode count or generic summarization
    if isinstance(columnName, list):
        return sumDf.groupby(columnName, as_index=False).count()

    # numeric_only=True keeps date/text columns from raising on pandas >= 2.0
    return sumDf.groupby(columnName, as_index=False).sum(numeric_only=True)

### Data Transformations

##### Release Data

In [51]:
# Parse the release date first so the arithmetic below always runs on
# datetime values, even if the workbook delivered the column as text.
releaseDf = DateTimeConvert(releaseDf,'Release Date')

# End of each show's tracking window: 90 days after its release date.
# The result of datetime + offset is already datetime, so no further
# conversion of '90DayDate' is needed.
releaseDf['90DayDate'] = releaseDf['Release Date'] + pd.DateOffset(days=90)

##### Instagram Hashtag Data

In [52]:
# Instagram hashtag pipeline: parse dates, derive per-show changes, restrict
# to each show's release window, then aggregate per show.
instaHashtagDf = DateTimeConvert(instaHashtagDf,'RunDate')

# Differences are computed on the full history, before the window filter.
instaHashtagDf = DiffMaker(instaHashtagDf,'HashtagValue','RunDate')

# Row-level frame: only rows between 'Release Date' and '90DayDate'.
instaHashDf = RemoveData(releaseDf, instaHashtagDf)
# 'zeroedDiffs' = 'diffs' with negative values replaced by 0.
instaHashDf = NegativeDiffs(instaHashDf, 'diffs')
instaHashDf['SocialMediaSource'] = 'InstagramHashtag'
instaHashDf['SocialMediaValue'] = instaHashDf['zeroedDiffs']

# Per-show frame built from the windowed rows.
iHashtagDf = SumSocialMedia(instaHashDf, 'TvShow')

# The source label is re-applied on the aggregated frame.
iHashtagDf['SocialMediaValue'] = iHashtagDf['zeroedDiffs']
iHashtagDf['SocialMediaSource'] = 'InstagramHashtag'

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Twitter Data

In [53]:
# Twitter pipeline: parse dates, restrict to each show's release window,
# then combine tweets and retweets into one value per row.
dailyTweetDf = DateTimeConvert(dailyTweetDf,'RunDate')
# From here on dailyTweetDf holds only the in-window rows; the full frame
# read from disk is no longer reachable under this name.
dailyTweetDf = RemoveData(releaseDf, dailyTweetDf)
dailyTweetDf['SocialMediaValue'] = dailyTweetDf['TweetCount'] + dailyTweetDf['RetweetCount']
dailyTweetDf['SocialMediaSource'] = 'Tweets'

# Per-show frame built from the windowed rows.
iTweetCountDf = SumSocialMedia(dailyTweetDf, 'TvShow')
iTweetCountDf['SocialMediaSource'] = 'Tweets'

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Reddit Comment Data

# Reddit comment pipeline (this cell carries no execution count).
# NOTE(review): redditCommentsDf is not created by the load cell — its read
# is commented out there — so this cell cannot run as written.
# NOTE(review): the date column here is 'runDate', while RemoveData filters
# on 'RunDate' — confirm the actual column name in the source file.
redditCommentsDf = DateTimeConvert(redditCommentsDf,'runDate')

redCommentDf = RemoveData(releaseDf, redditCommentsDf)


iRedCommentDf = SumSocialMedia(redCommentDf, 'TvShow')
# NOTE(review): the right-hand side uses the row-level redCommentDf while the
# target is the per-show iRedCommentDf, so values are matched by index label
# rather than by show — confirm whether iRedCommentDf columns were intended.
iRedCommentDf['SocialMediaValue'] = redCommentDf['NumComments'] + redCommentDf['score']
iRedCommentDf['SocialMediaSource'] = 'RedditComments'

##### Instagram Account Data

In [54]:
# Instagram account pipeline: parse dates, derive per-show changes, restrict
# to each show's release window, then aggregate per show.
igAccountDf = DateTimeConvert(igAccountDf,'RunDate')

# Differences are computed on the full history, before the window filter.
igAccountDf = DiffMaker(igAccountDf,'IgAccountCounts','RunDate')


# Row-level frame: only rows between 'Release Date' and '90DayDate'.
instaAccDf = RemoveData(releaseDf, igAccountDf)
# 'zeroedDiffs' = 'diffs' with negative values replaced by 0.
instaAccDf = NegativeDiffs(instaAccDf, 'diffs')
instaAccDf['SocialMediaSource'] = 'InstagramAccount'
instaAccDf['SocialMediaValue'] = instaAccDf['zeroedDiffs']

# Per-show frame built from the windowed rows.
iAccountDf = SumSocialMedia(instaAccDf, 'TvShow')

# The source label is re-applied on the aggregated frame.
iAccountDf['SocialMediaValue'] = iAccountDf['zeroedDiffs']
iAccountDf['SocialMediaSource'] = 'InstagramAccount'

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Reddit Subscriber Data

In [55]:
# Reddit subscriber pipeline: parse dates, derive per-show changes, restrict
# to each show's release window, then aggregate per show.
redditSubsDf = DateTimeConvert(redditSubsDf,'RunDate')

# Differences are computed on the full history, before the window filter.
redditSubsDf = DiffMaker(redditSubsDf,'RedditSubs','RunDate')

# Row-level frame: only rows between 'Release Date' and '90DayDate'.
redSubDf = RemoveData(releaseDf, redditSubsDf)
# 'zeroedDiffs' = 'diffs' with negative values replaced by 0.
redSubDf = NegativeDiffs(redSubDf, 'diffs')
redSubDf['SocialMediaSource'] = 'RedditSubscribers'
redSubDf['SocialMediaValue'] = redSubDf['zeroedDiffs']

# Per-show frame built from the windowed rows.
iRedSubDf = SumSocialMedia(redSubDf, 'TvShow')
# The source label is re-applied on the aggregated frame.
iRedSubDf['SocialMediaValue'] = iRedSubDf['zeroedDiffs']
iRedSubDf['SocialMediaSource'] = 'RedditSubscribers'

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Google Data

In [56]:
# Google pipeline: parse dates and restrict to each show's release window.
googleTrendsDf = DateTimeConvert(googleTrendsDf,'RunDate')

googDf = RemoveData(releaseDf, googleTrendsDf)
# Unlike the Instagram/Reddit cells, no DiffMaker step is applied here:
# 'zeroedDiffs' is 'GoogleValue' itself with negative values replaced by 0.
googDf = NegativeDiffs(googDf, 'GoogleValue')
googDf['SocialMediaSource'] = 'Google'
googDf['SocialMediaValue'] = googDf['zeroedDiffs']

  bigDf = bigDf.append(newdf,ignore_index=True)


##### Release Date Data

In [57]:
# Episode release calendar: one row per (show, release date).
releaseDateDf = DateTimeConvert(releaseDateDf,'EpisodeReleaseDate')
# Passing a list of keys selects the counting path of SumSocialMedia, so
# 'EpisodeNumber' in the result is the number of rows for that show and date.
iReleaseData = SumSocialMedia(releaseDateDf, ['TvShow','EpisodeReleaseDate'])
iReleaseData['DailyReleaseCount'] = iReleaseData['EpisodeNumber']
# Copy the date under 'RunDate', the column name the social media frames use.
iReleaseData['RunDate'] = iReleaseData['EpisodeReleaseDate']

##### Join the Data

##### Time Value Datasets

In [58]:
# Trim every row-level frame down to the columns used in the combined
# dataset. Sources that went through DiffMaker also keep their raw 'diffs'.
instaAccDf = instaAccDf.loc[
    :, ['TvShow','RunDate','SocialMediaValue','SocialMediaSource','diffs']
]
redSubDf = redSubDf.loc[
    :, ['TvShow','RunDate','SocialMediaValue','SocialMediaSource','diffs']
]
instaHashDf = instaHashDf.loc[
    :, ['TvShow','RunDate','SocialMediaValue','SocialMediaSource','diffs']
]

# Twitter and Google carry no 'diffs' column.
dailyTweetDf = dailyTweetDf.loc[
    :, ['TvShow','RunDate','SocialMediaValue','SocialMediaSource']
]
googDf = googDf.loc[
    :, ['TvShow','RunDate','SocialMediaValue','SocialMediaSource']
]

# Episode calendar: show, date, and number of episodes released that date.
iReleaseData = iReleaseData.loc[:, ['TvShow','RunDate','DailyReleaseCount']]
iReleaseData

Unnamed: 0,TvShow,RunDate,DailyReleaseCount
0,ABlackLadySketchShow,2022-04-08,1
1,ABlackLadySketchShow,2022-04-15,1
2,ABlackLadySketchShow,2022-04-22,1
3,ABlackLadySketchShow,2022-04-29,1
4,ABlackLadySketchShow,2022-05-06,1
...,...,...,...
292,theEssexSerpent,2022-05-13,2
293,theEssexSerpent,2022-05-20,1
294,theEssexSerpent,2022-05-27,1
295,theEssexSerpent,2022-06-03,1


In [59]:
# Stack the five row-level source frames on top of each other.
uberDf2 = pd.concat(
    [instaAccDf, redSubDf, dailyTweetDf, instaHashDf, googDf]
)

# Outer-join show metadata on the show name, then the episode calendar on
# show name and date.
uberDf3 = MergeDfs(
    MergeDfs(uberDf2, releaseDf, 'TvShow'),
    iReleaseData,
    ['TvShow', 'RunDate'],
)

# Drop rows whose 'Ignore' flag equals 0; rows with a missing flag are kept.
uberDf3 = uberDf3.loc[uberDf3['Ignore'].ne(0)]
uberDf3

Unnamed: 0,TvShow,RunDate,SocialMediaValue,SocialMediaSource,diffs,Stream,EpisodeCount,SeasonNumber,ReleaseCadence,Release Date,Ignore,ShowStatus,90DayDate,DailyReleaseCount
0,ABlackLadySketchShow,2022-04-08,175.0,InstagramAccount,175.0,HBOMax,6.0,3.0,Weekly,2022-04-08,,Renewed,2022-07-07,1.0
1,ABlackLadySketchShow,2022-04-08,154.0,Tweets,,HBOMax,6.0,3.0,Weekly,2022-04-08,,Renewed,2022-07-07,1.0
2,ABlackLadySketchShow,2022-04-08,12.0,InstagramHashtag,12.0,HBOMax,6.0,3.0,Weekly,2022-04-08,,Renewed,2022-07-07,1.0
3,ABlackLadySketchShow,2022-04-09,341.0,InstagramAccount,341.0,HBOMax,6.0,3.0,Weekly,2022-04-08,,Renewed,2022-07-07,
4,ABlackLadySketchShow,2022-04-09,599.0,Tweets,,HBOMax,6.0,3.0,Weekly,2022-04-08,,Renewed,2022-07-07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18066,TheAfterparty,2022-02-11,,,,,,,,NaT,,,NaT,1.0
18067,TheAfterparty,2022-02-18,,,,,,,,NaT,,,NaT,1.0
18068,TheAfterparty,2022-02-25,,,,,,,,NaT,,,NaT,1.0
18069,TheAfterparty,2022-03-04,,,,,,,,NaT,,,NaT,1.0


##### Aggregation Datasets

# Reduce each per-show aggregate to show name, value and source label.
igAccount = iAccountDf.loc[:, ['TvShow','SocialMediaValue','SocialMediaSource']]
redditSub = iRedSubDf.loc[:, ['TvShow','SocialMediaValue','SocialMediaSource']]
tweetComments = iTweetCountDf.loc[:, ['TvShow','SocialMediaValue','SocialMediaSource']]
igHashtag = iHashtagDf.loc[:, ['TvShow','SocialMediaValue','SocialMediaSource']]

##### Dataset Merging

# Long-format table: one row per show and source.
nuberData = pd.concat([igAccount,redditSub,tweetComments,igHashtag],axis=0)
nuberData.head()

# MergeDfs requires the join key as its third argument; both frames share
# the 'TvShow' column, so join on that.
uberDf = MergeDfs(releaseDf, iHashtagDf, 'TvShow')

# Columns present on both sides other than the key receive pandas' default
# '_x' / '_y' suffixes in the merged frame.
uberDf = MergeDfs(uberDf, iTweetCountDf, 'TvShow')

uberDf.head()

### Output - individuals & uber dataset

In [60]:
# Destination of the combined dataset. Note that this rebinds filepath2,
# which the load cell used for the tweet workbook.
filepath2 = r'/Users/cartersocha/Downloads/uberDataset.csv'

# Write the joined frame as CSV (the row index is written as the first column).
uberDf3.to_csv(path_or_buf=filepath2)