In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:

posts_df = pd.read_csv('./data/posts.csv')
users_df = pd.read_csv('./data/users.csv')
views_df = pd.read_csv('./data/views.csv')

In [5]:
users_df.head()

Unnamed: 0,_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


In [6]:
views_df.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


Exploring posts data

In [7]:
posts_df.head()

Unnamed: 0,_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


In [8]:
posts_df.describe()

Unnamed: 0,_id,title,category,post_type
count,493,493,465,493
unique,493,477,231,4
top,5d62abaa65218653a132c956,PENCIL RENDERING,Photography,artwork
freq,1,3,81,241


In [9]:
posts_df.dtypes

_id           object
title         object
category      object
 post_type    object
dtype: object

In [10]:
posts_df.isna().sum()

_id            0
title          0
category      28
 post_type     0
dtype: int64

In [16]:
#posts_df[posts_df["category"].isna()]

posts_df["category"].fillna("random",inplace=True)
posts_df.isna().sum()

_id           0
title         0
category      0
 post_type    0
dtype: int64

In [19]:
posts_df['category'].values

array(['Plant Biotechnology',
       'Artificial Intelligence|Machine Learning|Information Technology',
       'Operating Systems', 'Drawings', 'Competition Laws',
       'Competition Laws', 'Competition Laws', 'Competition Laws',
       'Eco System', 'Economic Policies', 'Graphic|Graphic Design',
       'Drawings', 'Drawings', 'Painting', 'Pen and ink', 'Drawings',
       'Computer Technology|Information Technology', 'Drawings|Painting',
       'Competition Laws',
       'Graphic Design|Visual Arts|Illustration|Graphic',
       'Drawings|Calligraphy', 'Photography', 'Empowerment',
       'Photography', 'random', 'random', 'Drawings', 'Video editing',
       'Inorganic Chemistry', 'random', 'Drawings',
       'Programming languages', 'Conceptual|Graphic Design',
       'HR Management', 'Drawings', 'Human Resources|HR Management',
       'Mass Media|International Relations', 'Sculptures|Artistic design',
       'Fashion Design|Ceramics|Artistic design', 'Craft|Artistic design',
       '

In [22]:
cat_dict = {}

for cat in posts_df['category']:
    cat_dict.update({cat:[]})
    for sub_cat in cat.split('|'):
        cat_dict[cat].append(sub_cat)

cat_dict

{'Plant Biotechnology': ['Plant Biotechnology'],
 'Artificial Intelligence|Machine Learning|Information Technology': ['Artificial Intelligence',
  'Machine Learning',
  'Information Technology'],
 'Operating Systems': ['Operating Systems'],
 'Drawings': ['Drawings'],
 'Competition Laws': ['Competition Laws'],
 'Eco System': ['Eco System'],
 'Economic Policies': ['Economic Policies'],
 'Graphic|Graphic Design': ['Graphic', 'Graphic Design'],
 'Painting': ['Painting'],
 'Pen and ink': ['Pen and ink'],
 'Computer Technology|Information Technology': ['Computer Technology',
  'Information Technology'],
 'Drawings|Painting': ['Drawings', 'Painting'],
 'Graphic Design|Visual Arts|Illustration|Graphic': ['Graphic Design',
  'Visual Arts',
  'Illustration',
  'Graphic'],
 'Drawings|Calligraphy': ['Drawings', 'Calligraphy'],
 'Photography': ['Photography'],
 'Empowerment': ['Empowerment'],
 'random': ['random'],
 'Video editing': ['Video editing'],
 'Inorganic Chemistry': ['Inorganic Chemistry']

In [50]:
keys = posts_df.columns
updated_posts_dict = {key:[] for key in keys}


for cat in posts_df['category']:

    id = posts_df[posts_df['category']==cat]['_id'].values[0]
    title = posts_df[posts_df['category']==cat]['title'].values[0]
    post_type = posts_df[posts_df['category']==cat][' post_type'].values[0]
    for sub_cat in cat_dict[cat]:
        updated_posts_dict['_id'].append(id)
        updated_posts_dict['title'].append(title)
        updated_posts_dict['category'].append(sub_cat)
        updated_posts_dict[' post_type'].append(post_type)

In [52]:
updated_posts_df = pd.DataFrame(updated_posts_dict)
updated_posts_df.head(15)

Unnamed: 0,_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence,blog
2,5d6d39567fa40e1417a4931c,Ml and AI,Machine Learning,blog
3,5d6d39567fa40e1417a4931c,Ml and AI,Information Technology,blog
4,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
5,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
6,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog
7,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog
8,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog
9,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


In [53]:
updated_posts_df.describe()

Unnamed: 0,_id,title,category,post_type
count,930,930,930,930
unique,232,228,235,4
top,5dc065ca24b883670268772f,Colours of pushkar.,Photography,blog
freq,81,81,96,481


(930, 4)