
# Data Cleaning and Modeling

- Use this data model to identify which datasets will be required to answer your business question, which is to figure out the top 5 categories with the largest popularity.


- Needed columns: content ID, category, content type, reaction type, and reaction score

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 

## Loading the data and identifying issues to clean the data 

<img src = "Schema.png" width=600 height=600>

In [5]:
# Read the three source CSVs from the Data/ directory in one pass.
content, reactions, reaction_types = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Content.csv",
        "Data/Reactions.csv",
        "Data/ReactionTypes.csv",
    )
)

In [8]:
# Preview the first rows of the content table as loaded from Data/Content.csv.
content.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [7]:
# Summarize column dtypes and non-null counts of the content table.
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   Content ID  1000 non-null   object
 2   User ID     1000 non-null   object
 3   Type        1000 non-null   object
 4   Category    1000 non-null   object
 5   URL         801 non-null    object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [32]:
# Number of distinct content IDs that received at least one reaction.
# NOTE(review): `reactions_clean` is assigned in a later cell of this notebook,
# so this cell depends on that cell having been executed first.
reactions_clean['content_ID'].nunique()

962

In [17]:
# Keep only the content columns needed for the analysis and give them
# snake_case names that will not collide with the reaction columns.
content_clean = (
    content[['Content ID', 'Type', 'Category']]
    .copy()
    .rename(
        columns={
            'Content ID': 'content_ID',
            'Type': 'content_type',
            'Category': 'category',
        }
    )
)


In [60]:
# Keep only the reaction columns needed for the analysis, rename them to
# snake_case, and drop every row that has a missing value in any kept column.
# The rename key must match the selected column name exactly ('Datetime'):
# DataFrame.rename ignores keys that are not present by default, so a
# differently-cased key would leave the column unrenamed without any error.
reactions_clean = (
    reactions[['Content ID', 'Type', 'Datetime']]
    .copy()
    .rename(
        columns={
            'Content ID': 'content_ID',
            'Type': 'reaction_type',
            'Datetime': 'date',
        }
    )
    .dropna(axis=0)
)

In [21]:
# Lookup table mapping each reaction type to its score, with column names
# aligned to the cleaned reactions table so the two can be merged.
reaction_t = (
    reaction_types[['Type', 'Score']]
    .copy()
    .rename(columns={'Type': 'reaction_type', 'Score': 'reaction_score'})
)

In [52]:
def clean(x):
    """Normalize a label string.

    Removes every double-quote character from ``x`` and converts the
    result to lower case.

    Parameters
    ----------
    x : str
        The raw label.

    Returns
    -------
    str
        The label without double quotes, lower-cased.
    """
    return x.replace('"', '').lower()

# Normalize every category label with `clean` (strip double quotes, lower-case).
content_clean['category'] = content_clean['category'].map(clean)

### After cleaning the data frames individually, I will merge them together. Note that content has 1000 unique content IDs, whereas reactions has 962.

In [33]:
# Attach each reaction to its content row, then attach the score of each
# reaction type. Both joins are inner joins (the pandas default), so rows
# without a match on the join key are dropped.
data = (
    content_clean
    .merge(reactions_clean, how='inner', on='content_ID')
    .merge(reaction_t, how='inner', on='reaction_type')
)

In [34]:
# Summarize column dtypes and non-null counts of the merged table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24573 entries, 0 to 24572
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content_ID      24573 non-null  object
 1   content_type    24573 non-null  object
 2   category        24573 non-null  object
 3   reaction_type   24573 non-null  object
 4   reaction_score  24573 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.1+ MB


In [39]:
# Number of distinct content IDs that remain after the merge.
data['content_ID'].nunique()

962

In [36]:
# Distinct reaction types present in the merged table.
data['reaction_type'].unique()

array(['disgust', 'dislike', 'scared', 'interested', 'peeking', 'cherish',
       'hate', 'love', 'indifferent', 'super love', 'intrigued',
       'worried', 'like', 'heart', 'want', 'adore'], dtype=object)

In [57]:
# Distinct content types present in the merged table.
data['content_type'].unique()

array(['photo', 'video', 'GIF', 'audio'], dtype=object)

In [42]:
# Persist the merged table next to the source CSVs.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; consider index=False if downstream readers do not need it.
merged_csv_path = os.path.join('Data', "data.csv")
data.to_csv(merged_csv_path)

In [58]:
# Summarize column dtypes and non-null counts of the cleaned reactions table.
reactions_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24573 entries, 1 to 25552
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   content_ID     24573 non-null  object
 1   reaction_type  24573 non-null  object
dtypes: object(2)
memory usage: 575.9+ KB
