# Data Munging
This file processes the raw data that we get from the api to make it more useful for the analysis

In [1]:
import pandas as pd

In [2]:
#read in data
df=pd.read_csv('Data/stories.csv')

### Removing unwanted rows

In [3]:
#remove rows without tags
df=df.loc[df['tags']!='[]']

In [4]:
#remove NaN descriptions
df=df.loc[[len(str(x))>4 for x in df['description']]]


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,categories,commentCount,completed,copyright,cover,cover_timestamp,createDate,deleted,description,...,parts,rating,readCount,tags,title,url,user.avatar,user.fullname,user.name,voteCount
0,0,19,5187,True,1,https://a.wattpad.com/cover/12050237-256-k9918...,2014-01-29T18:37:29Z,2014-01-29T16:24:42Z,False,"Fina is a Enforcers Daughter, she's been harsh...",...,"[{'id': 36978107, 'title': 'Prologue', 'url': ...",3,2686300,"['alpha', 'curse', 'luna', 'magic', 'power', '...",Hadar #Watty2014,https://www.wattpad.com/story/12050237-hadar-w...,https://a.wattpad.com/useravatar/RevyRogue.128...,Reviana,RevyRogue,106794
2,2,19,9715,True,0,https://a.wattpad.com/cover/42937930-256-k9921...,2015-06-22T23:55:46Z,2015-06-22T23:55:09Z,False,yessss!!!!,...,"[{'id': 141082265, 'title': 'lol!', 'url': 'ht...",0,1339809,"['dumb', 'funny', 'listen', 'nothing', 'okay']",❇awsome comebacks❇,https://www.wattpad.com/story/42937930-%E2%9D%...,https://a.wattpad.com/useravatar/kittygirl_121...,,kittygirl_121604,37159
5,5,19,4509,True,1,https://a.wattpad.com/cover/23303650-256-k3456...,2015-01-04T12:28:05Z,2014-09-14T10:33:07Z,False,Hopefully this book makes your life a little e...,...,"[{'id': 77721987, 'title': 'HIII', 'url': 'htt...",1,902903,"['easy', 'hacks', 'life', 'lige', 'like']",Life Hacks!!,https://www.wattpad.com/story/23303650-life-hacks,https://a.wattpad.com/useravatar/UglyNugget.12...,,UglyNugget,20995
6,6,19,3385,False,0,https://a.wattpad.com/cover/7346101-256-k45569...,2013-08-02T22:01:32Z,2013-08-02T21:29:34Z,False,Alexzandria (Alex) Parker had a difficult fami...,...,"[{'id': 22313394, 'title': 'Behind closed door...",3,736788,"['student', 'teacher']",Behind closed doors (teacher/student romance),https://www.wattpad.com/story/7346101-behind-c...,https://a.wattpad.com/useravatar/PissedImNotAM...,,PissedImNotAMermaid,13646
7,7,19,2798,True,0,https://a.wattpad.com/cover/8897341-256-k90366...,2014-10-21T22:01:08Z,2013-10-10T17:35:33Z,False,Meet Kyleigh Hawthorne. She's The New Girl At ...,...,"[{'id': 27208138, 'title': 'Changed', 'url': '...",3,663083,"['brothers', 'drama', 'girl', 'high', 'near-de...",Changed,https://www.wattpad.com/story/8897341-changed,https://a.wattpad.com/useravatar/Sugar-Quills....,,Sugar-Quills,13549


### Removing unwanted columns
We have some unwanted columns in the table so we will remove them and create a stripped down version of the data frame that we will work with for the analysis. 
The following columns are removed:
* Unnamed: 0
* copyright
* cover
* cover_timestamp
* firstPartId
* firstPublishedPart.createDate
* firstPublishedPart.id
* lastPublishedPart.createDate
* lastPublishedPart.id
* parts

In [39]:
df.columns

Index(['Unnamed: 0', 'categories', 'commentCount', 'completed', 'copyright',
       'cover', 'cover_timestamp', 'createDate', 'deleted', 'description',
       'firstPartId', 'firstPublishedPart.createDate', 'firstPublishedPart.id',
       'id', 'language.id', 'language.name', 'lastPublishedPart.createDate',
       'lastPublishedPart.id', 'length', 'mature', 'modifyDate', 'numParts',
       'parts', 'rating', 'readCount', 'tags', 'title', 'url', 'user.avatar',
       'user.fullname', 'user.name', 'voteCount'],
      dtype='object')

In [40]:
# remove the unwanted columns to minimize clutter
stripped_df = df[["categories","commentCount","completed",
                  "createDate","deleted","description","id", 
                  "language.id","language.name","length","mature",
                  "modifyDate","numParts","rating","readCount",
                  "tags","title","url","user.avatar","user.fullname",
                  "user.name","voteCount"]]

### Merging with the categories data
The categories in the stories list is listed as an id so we need to get the corresponding category names into this dataframe. We will also rename and reorder the columns to make more logical sense

In [42]:
categories_df=pd.read_csv('Data/categories.csv')
categories_df

Unnamed: 0,ID,NAME
0,4,Romance
1,5,Science Fiction
2,3,Fantasy
3,7,Humor
4,12,Paranormal
5,8,Mystery / Thriller
6,9,Horror
7,11,Adventure
8,23,Historical Fiction
9,1,Teen Fiction


In [46]:
# Add a new column to put in the corresponding category names
stripped_df["categoryName"] = stripped_df["categories"]
stripped_df["categoryName"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


array([19])

In [47]:
# Function to get the category name from the categories_df given the category id
def get_category(x):
    name = categories_df.loc[categories_df["ID"]==int(x),"NAME"]
    return name.iloc[0]

# Replace the id with name in the categoryName column
stripped_df["categoryName"] = stripped_df["categoryName"].apply(get_category)
stripped_df["categoryName"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


array(['Random'], dtype=object)

In [48]:
# rename some columns to keep the column name format consistent
stripped_df = stripped_df.rename(columns={"categories":"categoryId",
                            "language.id": "languageId",
                            "language.name": "languageName",
                            "user.avatar": "userAvatar",
                            "user.fullname":"userFullname",
                            "user.name":"userName"})

# reorder the columns
stripped_df = stripped_df[["id","title","description","url","createDate",
                          "modifyDate","completed","numParts","deleted","length",
                          "categoryId","categoryName","languageId","languageName","mature","rating",
                          "tags","commentCount","readCount","voteCount","userAvatar","userFullname","userName"]]
stripped_df.head()

Unnamed: 0,id,title,description,url,createDate,modifyDate,completed,numParts,deleted,length,...,languageName,mature,rating,tags,commentCount,readCount,voteCount,userAvatar,userFullname,userName
0,12050237,Hadar #Watty2014,"Fina is a Enforcers Daughter, she's been harsh...",https://www.wattpad.com/story/12050237-hadar-w...,2014-01-29T16:24:42Z,2015-01-27T10:47:41Z,True,37,False,407525,...,English,False,3,"['alpha', 'curse', 'luna', 'magic', 'power', '...",5187,2686300,106794,https://a.wattpad.com/useravatar/RevyRogue.128...,Reviana,RevyRogue
2,42937930,❇awsome comebacks❇,yessss!!!!,https://www.wattpad.com/story/42937930-%E2%9D%...,2015-06-22T23:55:09Z,2015-08-16T08:07:12Z,True,16,False,2204,...,English,False,0,"['dumb', 'funny', 'listen', 'nothing', 'okay']",9715,1339809,37159,https://a.wattpad.com/useravatar/kittygirl_121...,,kittygirl_121604
5,23303650,Life Hacks!!,Hopefully this book makes your life a little e...,https://www.wattpad.com/story/23303650-life-hacks,2014-09-14T10:33:07Z,2015-01-27T09:58:01Z,True,89,False,17106,...,English,False,1,"['easy', 'hacks', 'life', 'lige', 'like']",4509,902903,20995,https://a.wattpad.com/useravatar/UglyNugget.12...,,UglyNugget
6,7346101,Behind closed doors (teacher/student romance),Alexzandria (Alex) Parker had a difficult fami...,https://www.wattpad.com/story/7346101-behind-c...,2013-08-02T21:29:34Z,2015-08-29T23:44:05Z,False,14,False,38084,...,English,False,3,"['student', 'teacher']",3385,736788,13646,https://a.wattpad.com/useravatar/PissedImNotAM...,,PissedImNotAMermaid
7,8897341,Changed,Meet Kyleigh Hawthorne. She's The New Girl At ...,https://www.wattpad.com/story/8897341-changed,2013-10-10T17:35:33Z,2015-11-04T05:48:53Z,True,31,False,235195,...,English,False,3,"['brothers', 'drama', 'girl', 'high', 'near-de...",2798,663083,13549,https://a.wattpad.com/useravatar/Sugar-Quills....,,Sugar-Quills


### Saving the clean data to a new csv file

In [49]:
# save the data frame to a csv file to be used for vizualizations later
stripped_df.to_csv("./Data/stories_for_viz.csv", index=False)