In [1]:
import pandas as pd

### Published Content

In [2]:
#import published content csv
content_path = './assets/datasets/content_published_scrubbed.csv'
pub_content = pd.read_csv(content_path)

In [3]:
pub_content.head()

Unnamed: 0,action,content_type_name,parent_content_type_name,project_id,user_id
0,content_published,post,brainstorm,91,51323
1,content_published,idea,brainstorm,91,74912
2,content_published,idea,brainstorm,91,63773
3,content_published,idea,brainstorm,91,75755
4,content_published,idea,brainstorm,91,75755


In [4]:
pub_content.shape

(30325, 5)

In [5]:
pub_content.loc[:, 'content_type_name'].value_counts()

post     26892
entry     2817
idea       616
Name: content_type_name, dtype: int64

In [6]:
pub_content.loc[:, 'content_published'] = pub_content.loc[:, 'action'].map({'content_published': 1})
pub_content.head()

Unnamed: 0,action,content_type_name,parent_content_type_name,project_id,user_id,content_published
0,content_published,post,brainstorm,91,51323,1.0
1,content_published,idea,brainstorm,91,74912,1.0
2,content_published,idea,brainstorm,91,63773,1.0
3,content_published,idea,brainstorm,91,75755,1.0
4,content_published,idea,brainstorm,91,75755,1.0


In [7]:
pub_content.dropna(inplace=True)
pub_content.head()

Unnamed: 0,action,content_type_name,parent_content_type_name,project_id,user_id,content_published
0,content_published,post,brainstorm,91,51323,1.0
1,content_published,idea,brainstorm,91,74912,1.0
2,content_published,idea,brainstorm,91,63773,1.0
3,content_published,idea,brainstorm,91,75755,1.0
4,content_published,idea,brainstorm,91,75755,1.0


In [8]:
pub_content.loc[:, 'content_published'] = pub_content.loc[:, 'content_published'].astype('int')
pub_content.head()

Unnamed: 0,action,content_type_name,parent_content_type_name,project_id,user_id,content_published
0,content_published,post,brainstorm,91,51323,1
1,content_published,idea,brainstorm,91,74912,1
2,content_published,idea,brainstorm,91,63773,1
3,content_published,idea,brainstorm,91,75755,1
4,content_published,idea,brainstorm,91,75755,1


In [9]:
pub_content.drop(columns=['action'], inplace=True)

In [10]:
pub_content.head()

Unnamed: 0,content_type_name,parent_content_type_name,project_id,user_id,content_published
0,post,brainstorm,91,51323,1
1,idea,brainstorm,91,74912,1
2,idea,brainstorm,91,63773,1
3,idea,brainstorm,91,75755,1
4,idea,brainstorm,91,75755,1


In [11]:
pub_content = pd.get_dummies(pub_content, columns=['content_type_name'])

In [12]:
pub_content = pub_content.drop(columns=['content_type_name_idea'])

In [13]:
pub_content.head()

Unnamed: 0,parent_content_type_name,project_id,user_id,content_published,content_type_name_entry,content_type_name_post
0,brainstorm,91,51323,1,0,1
1,brainstorm,91,74912,1,0,0
2,brainstorm,91,63773,1,0,0
3,brainstorm,91,75755,1,0,0
4,brainstorm,91,75755,1,0,0


### Dictionary:
 - content_type_name_entry = user added idea on a project challenge
 - content_type_name_post = user added post to the content type shown in parent_content_type_name
 - if content_published = 1 but it's not content_type_name_entry or content_type_name_post then it is content_type_name_idea
    

### Watches

In [14]:
#import watch csv
watch_path = './assets/datasets/watch_scrubbed.csv'
watches = pd.read_csv(watch_path)

In [15]:
watches.head()

Unnamed: 0,watch,content_type_name,parent_content_type_name,project_id,user_id
0,yes,project,project,166.0,1
1,yes,project,project,54.0,1
2,yes,project,project,210.0,75129
3,yes,project,project,120.0,1
4,yes,project,project,2.0,1


In [16]:
#Checking for different value types of the `content_type_name` column. Is it all projects?

In [17]:
watches.loc[:, 'content_type_name'].value_counts()

project    6004
Name: content_type_name, dtype: int64

In [18]:
#Getting a general shape of this dataset

In [19]:
watches.shape

(6004, 5)

In [20]:
#I'm seeing a lot of 1's as a user_id which is not a real ID and tells me that these were imputed 
#for any null values prior to me receiving the datasets. With this being the case I need to get rid of these rows.

In [21]:
watches.loc[:, 'user_id'].nunique()

4157

In [22]:
one_user = watches.loc[:, 'user_id'] == 1
watches = watches[watches.loc[:, 'user_id'] != 1]
watches.head()

Unnamed: 0,watch,content_type_name,parent_content_type_name,project_id,user_id
2,yes,project,project,210.0,75129
13,yes,project,project,211.0,75129
14,yes,project,project,211.0,75129
19,yes,project,project,209.0,75129
22,yes,project,project,208.0,75129


In [23]:
#Checking for null values, since it's only 12 in the project_id's column I feel comfortable just dropping the rows.

In [24]:
watches.isnull().sum()

watch                        0
content_type_name            0
parent_content_type_name     0
project_id                  12
user_id                      0
dtype: int64

In [25]:
watches = watches.dropna()

In [26]:
#The project id looks like a float. This should be an int.

In [27]:
watches.dtypes

watch                        object
content_type_name            object
parent_content_type_name     object
project_id                  float64
user_id                       int64
dtype: object

In [28]:
watches.loc[:, 'project_id'] = watches.loc[:, 'project_id'].astype(int)
watches.dtypes

watch                       object
content_type_name           object
parent_content_type_name    object
project_id                   int64
user_id                      int64
dtype: object

In [29]:
#Since I will be merging this with other datasets. `watches` should be a binary variable so that any rows that
#do were not "watched" can be set as 0 rather than being NA

In [30]:
watches.loc[:, 'watch'] = watches.loc[:, 'watch'].map({'yes': 1})

In [31]:
watches.head()

Unnamed: 0,watch,content_type_name,parent_content_type_name,project_id,user_id
2,1,project,project,210,75129
13,1,project,project,211,75129
14,1,project,project,211,75129
19,1,project,project,209,75129
22,1,project,project,208,75129


In [32]:
#Since I used pd.get_dummies on the `content_type_name` column in the pub_content dataset to turn those values 
#into binary values the `content_type_name` column on this dataset should be binary as well.

In [33]:
watches.rename(columns = {'content_type_name': 'content_type_name_project'}, inplace=True)

In [34]:
watches.head()

Unnamed: 0,watch,content_type_name_project,parent_content_type_name,project_id,user_id
2,1,project,project,210,75129
13,1,project,project,211,75129
14,1,project,project,211,75129
19,1,project,project,209,75129
22,1,project,project,208,75129


In [35]:
watches.loc[:, 'content_type_name_project'] = watches.loc[:, 'content_type_name_project'].map({'project': 1})
watches.loc[:, 'content_type_name_project'].fillna(1, inplace=True)
watches.loc[:, 'content_type_name_project'] = watches.loc[:, 'content_type_name_project'].astype(int)
watches.head()

Unnamed: 0,watch,content_type_name_project,parent_content_type_name,project_id,user_id
2,1,1,project,210,75129
13,1,1,project,211,75129
14,1,1,project,211,75129
19,1,1,project,209,75129
22,1,1,project,208,75129


### Entry Votes

In [36]:
#import entry vote csv
vote_path = './assets/datasets/entry_vote_scrubbed.csv'
entry_votes = pd.read_csv(vote_path)

In [37]:
entry_votes.head()

Unnamed: 0,vote,content_type_name,project_id,parent_content_type_name,user_id
0,yes,entry,268,challenge,75084
1,yes,entry,268,challenge,75084
2,yes,entry,268,challenge,75084
3,yes,entry,268,challenge,75084
4,yes,entry,268,challenge,75084


In [38]:
entry_votes.dtypes

vote                        object
content_type_name           object
project_id                   int64
parent_content_type_name    object
user_id                      int64
dtype: object

In [39]:
#Checking for any unique values in the `content_type_name` column. It looks like all values are "entry"
#Within the pub_content dataframe there is already a column and value that represents this so I will match
#this dataframe to pub_content for consistency and easy merging.

In [40]:
entry_votes.loc[:, 'content_type_name'].value_counts()

entry    37893
Name: content_type_name, dtype: int64

In [41]:
entry_votes.shape

(37893, 5)

In [42]:
entry_votes['content_type_name'] = entry_votes.loc[:, 'content_type_name'].map({'entry': 1})
entry_votes.loc[:, 'content_type_name'].fillna(1, inplace=True)
entry_votes.loc[:, 'content_type_name'] = entry_votes.loc[:, 'content_type_name'].astype(int, inplace=True)
entry_votes.rename(columns={'content_type_name': 'content_type_name_entry'}, inplace=True)

In [43]:
#Similar to how I have done with other 'action' value columns like `watch`, vote's needs to be turned into a
#binary column for easier computing

In [44]:
entry_votes.head()

Unnamed: 0,vote,content_type_name_entry,project_id,parent_content_type_name,user_id
0,yes,1,268,challenge,75084
1,yes,1,268,challenge,75084
2,yes,1,268,challenge,75084
3,yes,1,268,challenge,75084
4,yes,1,268,challenge,75084


In [45]:
entry_votes['vote'] = entry_votes.loc[:, 'vote'].map({'yes': 1})
entry_votes['vote'] = entry_votes['vote'].fillna(1)
entry_votes['vote'] = entry_votes['vote'].astype(int)
entry_votes.head()

Unnamed: 0,vote,content_type_name_entry,project_id,parent_content_type_name,user_id
0,1,1,268,challenge,75084
1,1,1,268,challenge,75084
2,1,1,268,challenge,75084
3,1,1,268,challenge,75084
4,1,1,268,challenge,75084


In [46]:
entry_votes.loc[:, 'vote'].unique()

array([1])

### Projects

In [47]:
#import project csv
project_path = './assets/datasets/projects_scrubbed.csv'
projects = pd.read_csv(project_path)

In [48]:
projects.head()

Unnamed: 0,content_type_name,parent_content_type_name,post_count,topic_count,view_count,watch_count,participants,title,project_id,categories
0,project,project,1179,19,15769,713,755,#AccessibleOlli,266,Ground Mobility
1,project,project,10980,48,5367,359,376,Airbus Cargo Drone Build,166,Air Mobility
2,project,project,116,8,1809,357,368,Axion,250,Consumer Products
3,project,project,116,8,1809,357,368,Axion,250,Ground Mobility
4,project,project,116,8,1809,357,368,Axion,250,Infrastructure and Energy


In [49]:
#Checking general values of the dataset to determine what items need to be changed or imputed to be able to merge 
#with other datasets

In [50]:
projects.loc[:, 'content_type_name'].value_counts()

project    41
Name: content_type_name, dtype: int64

In [51]:
projects.shape

(41, 10)

In [52]:
projects.isnull().sum()

content_type_name           0
parent_content_type_name    0
post_count                  0
topic_count                 0
view_count                  0
watch_count                 0
participants                0
title                       0
project_id                  0
categories                  0
dtype: int64

In [53]:
#Data looks pretty clean, the only thing to do in here looks to be mapping the `content_type_name` column 

In [54]:
projects['content_type_name_project'] = projects.loc[:, 'content_type_name'].map({'project': 1})
projects.drop(axis='columns', columns = ['content_type_name'])

Unnamed: 0,parent_content_type_name,post_count,topic_count,view_count,watch_count,participants,title,project_id,categories,content_type_name_project
0,project,1179,19,15769,713,755,#AccessibleOlli,266,Ground Mobility,1
1,project,10980,48,5367,359,376,Airbus Cargo Drone Build,166,Air Mobility,1
2,project,116,8,1809,357,368,Axion,250,Consumer Products,1
3,project,116,8,1809,357,368,Axion,250,Ground Mobility,1
4,project,116,8,1809,357,368,Axion,250,Infrastructure and Energy,1
5,project,111,5,1935,72,78,Camilo Pardo 3E concept,143,Consumer Products,1
6,project,111,5,1935,72,78,Camilo Pardo 3E concept,143,Ground Mobility,1
7,project,32,3,559,175,181,Detecting Corrosion Under Insulation,264,Industrial Inspection and Monitoring,1
8,project,88,12,1283,116,122,Detecting Weld Seams,211,Industrial Inspection and Monitoring,1
9,project,753,5,2474,2562,2632,Explore with Us: The Future of Mobility,322,Air Mobility,1


### Users

In [55]:
#import users csv
users_path = './assets/datasets/users_scrubbed.csv'
users = pd.read_csv(users_path)

In [56]:
users.head()

Unnamed: 0,username,user_id
0,Julorsa,85423
1,sipun,87192
2,donandtheresa,75720
3,deadbard,91022
4,gourav1279verma,89105


In [57]:
#Going through the basic exploratory data analysis for this dataset, looks like everything is set and clean

In [58]:
users.shape

(194152, 2)

In [59]:
users.isnull().sum()

username    0
user_id     0
dtype: int64

In [60]:
users.dtypes

username    object
user_id      int64
dtype: object

### Follows

In [61]:
#import follows csv
follows_path = './assets/datasets/signup_follow_projects_success_scrubbed_split.csv'
follows = pd.read_csv(follows_path)

In [62]:
#Going through the basic exploratory data analysis for this dataset, looks like there us an extra column that 
#needs to be removed.

In [63]:
follows.head()

Unnamed: 0.1,Unnamed: 0,action,project_ids,user_id
0,0,follow,266,83093
1,1,follow,208,83093
2,2,follow,266,83094
3,3,follow,264,83094
4,4,follow,250,83095


In [64]:
follows.shape

(102101, 4)

In [65]:
follows.isnull().sum()

Unnamed: 0     0
action         0
project_ids    0
user_id        0
dtype: int64

In [66]:
follows.dtypes

Unnamed: 0      int64
action         object
project_ids     int64
user_id         int64
dtype: object

In [67]:
follows = follows.drop(columns=['Unnamed: 0'])

In [68]:
follows.head()

Unnamed: 0,action,project_ids,user_id
0,follow,266,83093
1,follow,208,83093
2,follow,266,83094
3,follow,264,83094
4,follow,250,83095


In [69]:
#The `action` column needs to be mapped like `watch` and `vote` were.

In [70]:
follows['follow'] = follows.loc[:, 'action'].map({'follow': 1})

In [71]:
follows.drop(axis='columns', columns=['action'], inplace=True)

In [72]:
#Rename `project_ids` to `project_id` to match other dataframes

In [73]:
follows.rename(columns={'project_ids': 'project_id'}, inplace=True)

In [74]:
follows.head()

Unnamed: 0,project_id,user_id,follow
0,266,83093,1
1,208,83093,1
2,266,83094,1
3,264,83094,1
4,250,83095,1


## Merging Dataframes

In [75]:
#Merge pub_content and projects on `project_id`

In [76]:
pub_content.head()

Unnamed: 0,parent_content_type_name,project_id,user_id,content_published,content_type_name_entry,content_type_name_post
0,brainstorm,91,51323,1,0,1
1,brainstorm,91,74912,1,0,0
2,brainstorm,91,63773,1,0,0
3,brainstorm,91,75755,1,0,0
4,brainstorm,91,75755,1,0,0


In [77]:
projects.head()

Unnamed: 0,content_type_name,parent_content_type_name,post_count,topic_count,view_count,watch_count,participants,title,project_id,categories,content_type_name_project
0,project,project,1179,19,15769,713,755,#AccessibleOlli,266,Ground Mobility,1
1,project,project,10980,48,5367,359,376,Airbus Cargo Drone Build,166,Air Mobility,1
2,project,project,116,8,1809,357,368,Axion,250,Consumer Products,1
3,project,project,116,8,1809,357,368,Axion,250,Ground Mobility,1
4,project,project,116,8,1809,357,368,Axion,250,Infrastructure and Energy,1


In [78]:
projects.loc[:, 'categories'].value_counts()

Ground Mobility                         9
Industrial Inspection and Monitoring    8
Mars                                    6
Air Mobility                            5
Consumer Products                       4
Adaptive Sports                         3
 Air Mobility                           2
Infrastructure and Energy               2
Architecture and Construction           1
Health and Wellness                     1
Name: categories, dtype: int64

In [79]:
#It looks like I have a ` Air Mobility` and `Air Mobility` value in my `categories` column, which will need to be 
#fixed before I can merge these two DFs

In [80]:
spaced_air_mobility = projects.loc[:, 'categories'] == ' Air Mobility'

In [81]:
projects.loc[spaced_air_mobility, 'categories']

9      Air Mobility
40     Air Mobility
Name: categories, dtype: object

In [82]:
projects.replace({'categories': 9, 'categories': 40}, 'Air Mobility', inplace=True)

In [83]:
projects.loc[:, 'categories'].value_counts()

Ground Mobility                         9
Industrial Inspection and Monitoring    8
Mars                                    6
Air Mobility                            5
Consumer Products                       4
Adaptive Sports                         3
 Air Mobility                           2
Infrastructure and Energy               2
Architecture and Construction           1
Health and Wellness                     1
Name: categories, dtype: int64

In [84]:
#Merging `pub_content` and `projects` on project_id
project_pub = pd.merge(pub_content, projects, how='outer', on='project_id')

In [85]:
#There are a lot of null columns but all null values in follows will be filled with 0's and the project_ids should 
#fill out more as I merge in the other dataframes.
project_pub.isnull().sum()

parent_content_type_name_x      1
project_id                      0
user_id                         1
content_published               1
content_type_name_entry         1
content_type_name_post          1
content_type_name             373
parent_content_type_name_y    373
post_count                    373
topic_count                   373
view_count                    373
watch_count                   373
participants                  373
title                         373
categories                    373
content_type_name_project     373
dtype: int64

In [159]:
project_pub.head()

Unnamed: 0,parent_content_type_name_x,project_id,user_id,content_published,content_type_name_entry,content_type_name_post,content_type_name,parent_content_type_name_y,post_count,topic_count,view_count,watch_count,participants,title,categories,content_type_name_project
61,brainstorm,155,1439.0,1.0,0.0,0.0,project,project,1586.0,27.0,6116.0,482.0,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0
62,brainstorm,155,75774.0,1.0,0.0,0.0,project,project,1586.0,27.0,6116.0,482.0,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0
63,brainstorm,155,10.0,1.0,0.0,0.0,project,project,1586.0,27.0,6116.0,482.0,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0
64,brainstorm,155,1439.0,1.0,0.0,0.0,project,project,1586.0,27.0,6116.0,482.0,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0
65,brainstorm,155,1439.0,1.0,0.0,0.0,project,project,1586.0,27.0,6116.0,482.0,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0


In [87]:
project_pub.shape

(34811, 16)

In [88]:
#It looks like we have projects that do not have a category or title. 

In [89]:
nulls = project_pub.loc[:, 'categories'].isnull()
null_project_ids = project_pub.loc[nulls, 'project_id']
print(null_project_ids.unique())
print(null_project_ids.nunique())

[ 91 122 126 157   6   2  83 323 296  69   4  52  72 304 295  54  63  56
   5  99 256 252   3]
23


In [90]:
project_pub.loc[:, 'categories'].value_counts()

Mars                                    13657
Ground Mobility                          9966
 Air Mobility                            3786
Adaptive Sports                          2824
Industrial Inspection and Monitoring     1891
Consumer Products                         697
Air Mobility                              612
Infrastructure and Energy                 413
Architecture and Construction             406
Health and Wellness                       186
Name: categories, dtype: int64

In [91]:
proj_id = project_pub.loc[:, 'project_id'] == 91
project_pub.loc[proj_id, :]

Unnamed: 0,parent_content_type_name_x,project_id,user_id,content_published,content_type_name_entry,content_type_name_post,content_type_name,parent_content_type_name_y,post_count,topic_count,view_count,watch_count,participants,title,categories,content_type_name_project
0,brainstorm,91,51323.0,1.0,0.0,1.0,,,,,,,,,,
1,brainstorm,91,74912.0,1.0,0.0,0.0,,,,,,,,,,
2,brainstorm,91,63773.0,1.0,0.0,0.0,,,,,,,,,,
3,brainstorm,91,75755.0,1.0,0.0,0.0,,,,,,,,,,
4,brainstorm,91,75755.0,1.0,0.0,0.0,,,,,,,,,,
5,brainstorm,91,75755.0,1.0,0.0,0.0,,,,,,,,,,
6,brainstorm,91,81892.0,1.0,0.0,0.0,,,,,,,,,,
7,brainstorm,91,75986.0,1.0,0.0,0.0,,,,,,,,,,
8,brainstorm,91,75950.0,1.0,0.0,0.0,,,,,,,,,,
9,brainstorm,91,75950.0,1.0,0.0,0.0,,,,,,,,,,


In [92]:
#After reviewing these projects our API (this could not be pulled into the notebook because of sensitive information)
#I found that all of these projects were created and oublished prior to the 9/2016 cutoff date and therefore
#these values should be dropped and were purposefully left out to prevent bias

In [93]:
project_pub.dropna(inplace=True)
project_pub.isnull().sum()

parent_content_type_name_x    0
project_id                    0
user_id                       0
content_published             0
content_type_name_entry       0
content_type_name_post        0
content_type_name             0
parent_content_type_name_y    0
post_count                    0
topic_count                   0
view_count                    0
watch_count                   0
participants                  0
title                         0
categories                    0
content_type_name_project     0
dtype: int64

In [95]:
#Merge project_pub and follows on `project_id` and `user_id`

In [96]:
follows.head()

Unnamed: 0,project_id,user_id,follow
0,266,83093,1
1,208,83093,1
2,266,83094,1
3,264,83094,1
4,250,83095,1


In [110]:
proj_pub_follow = pd.merge(project_pub, follows, how='left', on=['project_id', 'user_id'])

In [111]:
proj_pub_follow.shape

(36038, 17)

In [112]:
proj_pub_follow.isnull().sum()

parent_content_type_name_x        0
project_id                        0
user_id                           0
content_published                 0
content_type_name_entry           0
content_type_name_post            0
content_type_name                 0
parent_content_type_name_y        0
post_count                        0
topic_count                       0
view_count                        0
watch_count                       0
participants                      0
title                             0
categories                        0
content_type_name_project         0
follow                        28244
dtype: int64

In [113]:
#Merge proj_pub_follow and wacthes on `project_id` and `user_id`

In [114]:
watches.head()

Unnamed: 0,watch,content_type_name_project,parent_content_type_name,project_id,user_id
2,1,1,project,210,75129
13,1,1,project,211,75129
14,1,1,project,211,75129
19,1,1,project,209,75129
22,1,1,project,208,75129


In [118]:
proj_pub_fol_watch = pd.merge(proj_pub_follow, watches, how='left', on=['project_id', 'user_id'])

In [119]:
proj_pub_fol_watch.shape

(36790, 20)

In [120]:
proj_pub_fol_watch.isnull().sum()

parent_content_type_name_x         0
project_id                         0
user_id                            0
content_published                  0
content_type_name_entry            0
content_type_name_post             0
content_type_name                  0
parent_content_type_name_y         0
post_count                         0
topic_count                        0
view_count                         0
watch_count                        0
participants                       0
title                              0
categories                         0
content_type_name_project_x        0
follow                         28990
watch                          28478
content_type_name_project_y    28478
parent_content_type_name       28478
dtype: int64

In [189]:
proj_pub_fol_watch.rename(columns = {'content_type_name_project_x': 'content_type_name_project'}, inplace = True)

In [121]:
#Merge proj_pub_fol_watch and entry_votes on `project_id` and `user_id`

In [122]:
entry_votes.head()

Unnamed: 0,vote,content_type_name_entry,project_id,parent_content_type_name,user_id
0,1,1,268,challenge,75084
1,1,1,268,challenge,75084
2,1,1,268,challenge,75084
3,1,1,268,challenge,75084
4,1,1,268,challenge,75084


In [191]:
df = pd.merge(proj_pub_fol_watch, entry_votes, how='left', on=['project_id', 'user_id', 'parent_content_type_name'])

In [192]:
df.isnull().sum()

parent_content_type_name_x         0
project_id                         0
user_id                            0
content_published                  0
content_type_name_entry_x          0
content_type_name_post             0
content_type_name                  0
parent_content_type_name_y         0
post_count                         0
topic_count                        0
view_count                         0
watch_count                        0
participants                       0
title                              0
categories                         0
content_type_name_project          0
follow                         28990
watch                          28478
content_type_name_project_y    28478
parent_content_type_name       28478
vote                           36790
content_type_name_entry_y      36790
dtype: int64

In [193]:
df.shape

(36790, 22)

## Cleaning Final DF

In [194]:
df.head()

Unnamed: 0,parent_content_type_name_x,project_id,user_id,content_published,content_type_name_entry_x,content_type_name_post,content_type_name,parent_content_type_name_y,post_count,topic_count,...,participants,title,categories,content_type_name_project,follow,watch,content_type_name_project_y,parent_content_type_name,vote,content_type_name_entry_y
0,brainstorm,155,1439,1.0,0.0,0.0,project,project,1586.0,27.0,...,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0,,,,,,
1,brainstorm,155,75774,1.0,0.0,0.0,project,project,1586.0,27.0,...,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0,,,,,,
2,brainstorm,155,10,1.0,0.0,0.0,project,project,1586.0,27.0,...,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0,,,,,,
3,brainstorm,155,1439,1.0,0.0,0.0,project,project,1586.0,27.0,...,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0,,,,,,
4,brainstorm,155,1439,1.0,0.0,0.0,project,project,1586.0,27.0,...,503.0,"Olli: self-driving, cognitive electric shuttle",Ground Mobility,1.0,,,,,,


In [195]:
#The follow column is binary, the NaN values will be the 0 for "no follows"
df['follow'] = df.follow.fillna(0, axis=0)

In [196]:
df['follow'].value_counts()

0.0    28990
1.0     7800
Name: follow, dtype: int64

In [197]:
#The vote column is binary, the NaN values will be the 0 for "no follows"
df['vote'] = df.vote.fillna(0, axis=0)

In [198]:
df['vote'].value_counts()

0.0    36790
Name: vote, dtype: int64

In [199]:
#The watch column is binary, the NaN values will be the 0 for "no follows"
df['watch'] = df.watch.fillna(0, axis=0)

In [200]:
df['watch'].value_counts()

0.0    28478
1.0     8312
Name: watch, dtype: int64

In [201]:
df.isnull().sum()

parent_content_type_name_x         0
project_id                         0
user_id                            0
content_published                  0
content_type_name_entry_x          0
content_type_name_post             0
content_type_name                  0
parent_content_type_name_y         0
post_count                         0
topic_count                        0
view_count                         0
watch_count                        0
participants                       0
title                              0
categories                         0
content_type_name_project          0
follow                             0
watch                              0
content_type_name_project_y    28478
parent_content_type_name       28478
vote                               0
content_type_name_entry_y      36790
dtype: int64

In [182]:
false_content = df.loc[:, 'parent_content_type_name_y'].isnull() == False