In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the imported libraries —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned YouTube trending dataset and preview its first rows.
# NOTE(review): 'unicode_escape' decodes bytes as Latin-1 plus backslash escapes; the garbled
# characters visible in `tags` suggest the file may really be UTF-8 — confirm the true encoding.
youtube = pd.read_csv(
    'youtubedata/YoutubeDataCleaned.csv',
    encoding='unicode_escape',
)
youtube.head()

Unnamed: 0,CITY,STATE,COUNTRY,trending_date,title,Channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed
0,Covington,KY,USA,18.25.02,Cheap Thrills - Sia / Tina Boo Choreography,1MILLION Dance Studio,24,2018-02-19T12:00:02.000Z,"choreography""|""1million dance studio""|""ìë°ë...",601159,27962,336,444,https://i.ytimg.com/vi/QRq0pkr2TWM/default.jpg,False,False,False
1,Covington,KY,USA,18.26.02,Cheap Thrills - Sia / Tina Boo Choreography,1MILLION Dance Studio,24,2018-02-19T12:00:02.000Z,"choreography""|""1million dance studio""|""ìë°ë...",627933,28580,340,444,https://i.ytimg.com/vi/QRq0pkr2TWM/default.jpg,False,False,False
2,Covington,KY,USA,18.01.03,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28T09:00:03.000Z,"choreography""|""1million dance studio""|""ìë°ë...",384249,26271,238,540,https://i.ytimg.com/vi/_xwX82Y0Oro/default.jpg,False,False,False
3,Covington,KY,USA,18.02.03,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28T09:00:03.000Z,"choreography""|""1million dance studio""|""ìë°ë...",513455,31505,279,623,https://i.ytimg.com/vi/_xwX82Y0Oro/default.jpg,False,False,False
4,Covington,KY,USA,18.03.03,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28T09:00:03.000Z,"choreography""|""1million dance studio""|""ìë°ë...",607740,35180,314,672,https://i.ytimg.com/vi/_xwX82Y0Oro/default.jpg,False,False,False


- A general overview of the columns and the dataset.

In [3]:
# Print the column names, non-null counts and dtypes of the raw frame.
youtube.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22524 entries, 0 to 22523
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CITY                    22524 non-null  object
 1   STATE                   22524 non-null  object
 2   COUNTRY                 22524 non-null  object
 3   trending_date           22524 non-null  object
 4   title                   22524 non-null  object
 5   Channel_title           22524 non-null  object
 6   category_id             22524 non-null  int64 
 7   publish_time            22524 non-null  object
 8   tags                    22524 non-null  object
 9   views                   22524 non-null  int64 
 10  likes                   22524 non-null  int64 
 11  dislikes                22524 non-null  int64 
 12  comment_count           22524 non-null  int64 
 13  thumbnail_link          22524 non-null  object
 14  comments_disabled       22524 non-null  bool  
 15  ra

- Looking for any null values.

In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
youtube.isna().sum()

CITY                      0
STATE                     0
COUNTRY                   0
trending_date             0
title                     0
Channel_title             0
category_id               0
publish_time              0
tags                      0
views                     0
likes                     0
dislikes                  0
comment_count             0
thumbnail_link            0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
dtype: int64

- Looking for any duplicated rows.

In [5]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
youtube.duplicated().sum()

0

- Summary statistics for the numerical columns of the data.

In [6]:
# Summary statistics for the non-object columns, transposed so each column becomes a row.
youtube.select_dtypes(exclude=['object']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
category_id,22524.0,20.15699,7.618999,1.0,17.0,24.0,25.0,43.0
views,22524.0,1329987.0,4242973.0,549.0,127517.75,376715.5,1116801.0,149376127.0
likes,22524.0,46202.51,147351.4,0.0,2547.0,10778.0,32743.75,3093544.0
dislikes,22524.0,2321.051,25954.7,0.0,115.0,380.0,1225.0,1643059.0
comment_count,22524.0,5532.497,23574.72,0.0,352.0,1190.0,3695.25,827755.0


- Summary statistics for the categorical (object) columns of the data.

In [7]:
# Count / unique / top / freq for the object (text) columns, transposed so each column becomes a row.
youtube.select_dtypes(include=['object']).describe().transpose()

Unnamed: 0,count,unique,top,freq
CITY,22524,147,Atlanta,1871
STATE,22524,50,CA,2861
COUNTRY,22524,1,USA,22524
trending_date,22524,125,17.26.11,191
title,22524,4448,Maroon 5 - Wait,18
Channel_title,22524,1761,ESPN,122
publish_time,22524,4333,2018-03-07T17:00:02.000Z,25
tags,22524,4031,[none],950
thumbnail_link,22524,4389,https://i.ytimg.com/vi/H0g4JxKp4fc/default.jpg,16


- Taking a copy of the original dataset so we can edit it without modifying the raw data.

In [8]:
# Work on an independent (deep) copy so the raw `youtube` frame stays untouched.
df = youtube.copy(deep=True)

- Fixing the time and date columns, as they are stored with the object dtype.

In [9]:
# trending_date is stored as 'YY.DD.MM' text (e.g. '18.25.02' -> 2018-02-25).
# Parse it directly with a matching format rather than rewriting the string first:
# a `.str.replace('.', '-')` step depends on the pandas version's default for `regex`
# (with regex=True the '.' is a wildcard and every character would be replaced).
# '%y' maps two-digit years 00-68 to 2000-2068, which matches prefixing '20' for this data.
# Unlike a string concatenation, this also tolerates the cell being re-run on an
# already-converted column.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# publish_time holds ISO-8601 timestamps with a 'Z' suffix (e.g. '2018-02-19T12:00:02.000Z'),
# which pandas parses without an explicit format into timezone-aware UTC datetimes.
df['publish_time'] = pd.to_datetime(df['publish_time'])

- Drop the unnecessary columns.

In [10]:
# Columns not needed for the analysis; removed from the working copy in place.
drop_cols = ['COUNTRY', 'tags', 'thumbnail_link']
df.drop(columns=drop_cols, inplace=True)

- Mapping the boolean flag columns to Yes/No labels for the analysis.

In [11]:
# Replace the boolean flags with readable 'Yes'/'No' labels.
# Values that are not keys of `mapping` become NaN under Series.map, so this cell
# must only be run once on the boolean columns.
mapping = {True: 'Yes', False: 'No'}
cols = ['comments_disabled', 'ratings_disabled', 'video_error_or_removed']
df[cols] = df[cols].apply(lambda flags: flags.map(mapping))

In [12]:
# Persist the cleaned frame. The index is a plain RangeIndex with no meaning, so it is
# not written; otherwise it would come back as a spurious 'Unnamed: 0' column on reload.
df.to_csv('new_youtube.csv', index=False)

In [13]:
# Preview the first rows of the cleaned working frame.
df.head()

Unnamed: 0,CITY,STATE,trending_date,title,Channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed
0,Covington,KY,2018-02-25,Cheap Thrills - Sia / Tina Boo Choreography,1MILLION Dance Studio,24,2018-02-19 12:00:02+00:00,601159,27962,336,444,No,No,No
1,Covington,KY,2018-02-26,Cheap Thrills - Sia / Tina Boo Choreography,1MILLION Dance Studio,24,2018-02-19 12:00:02+00:00,627933,28580,340,444,No,No,No
2,Covington,KY,2018-03-01,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28 09:00:03+00:00,384249,26271,238,540,No,No,No
3,Covington,KY,2018-03-02,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28 09:00:03+00:00,513455,31505,279,623,No,No,No
4,Covington,KY,2018-03-03,FRIENDS - Marshmello & Anne-Marie / Tina Boo C...,1MILLION Dance Studio,24,2018-02-28 09:00:03+00:00,607740,35180,314,672,No,No,No


In [32]:
# Correlation between likes and dislikes (Series.corr defaults to Pearson).
# Bracket access is used so the lookup cannot be shadowed by a DataFrame attribute.
df['likes'].corr(df['dislikes'])

0.44871785410548487