In [1]:
import pandas as pd
from pathlib import Path

# Folders containing the pickled dumps, plus the channel metadata table.
dir_13c = Path("/content/drive/MyDrive/Bangla Clickbaits/data_dump_13c")
dir_09c = Path("/content/drive/MyDrive/Bangla Clickbaits/data_dump_09c")
# NOTE(review): unpickling can execute arbitrary code — only load files you
# produced yourself or otherwise trust.
channel_data = pd.read_pickle("/content/drive/MyDrive/Bangla Clickbaits/channels.pkl")


def load_pickled_frames(directory):
    """Read every ``*.pkl`` file found recursively under ``directory`` into one DataFrame.

    All files are read first and concatenated in a single ``pd.concat`` call;
    concatenating inside the loop would re-copy the accumulated rows on every
    iteration.

    Parameters
    ----------
    directory : pathlib.Path
        Folder to search (recursively) for pickle files.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in discovery order with a fresh 0..n-1 index.
        An empty DataFrame is returned when no pickle file is found.
    """
    # read_pickle accepts path objects directly, so no string rebuilding is needed.
    frames = [pd.read_pickle(pkl_path) for pkl_path in directory.rglob("*.pkl")]
    if not frames:
        # pd.concat raises on an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


dataset_13c = load_pickled_frames(dir_13c)
dataset_09c = load_pickled_frames(dir_09c)

print(dataset_13c.shape, dataset_09c.shape)

(272635, 13) (185190, 9)


In [2]:
dataset_13c.isna().sum(), dataset_09c.isna().sum()

(channel_id           0
 video_id             0
 publishedAt          0
 title                0
 description          0
 url                  0
 viewCount           40
 commentCount      1425
 likeCount        16630
 dislikeCount     30168
 favoriteCount       40
 thumbnails           0
 label                0
 dtype: int64,
 channel_id         0
 title              0
 description        0
 url                0
 liked              1
 disliked       13539
 views              0
 comment           96
 label              0
 dtype: int64)

## Numeric Missing Value Imputation

In [3]:
from sklearn.impute import KNNImputer

# Numeric engagement columns of the 13-column dataset; these are the columns
# handed to the imputer so that missing entries get filled in.
knn_features = dataset_13c[
    ['viewCount', 'commentCount', 'likeCount', 'dislikeCount', 'favoriteCount']
]

# Fit on the selected columns and get back a plain array with the gaps filled,
# using the 5 nearest rows for each missing entry.
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(knn_features)

In [4]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
dataset_13c_cp = dataset_13c.copy(deep=True)

In [5]:
# Write the imputed array back into the working copy. The column order here
# must match the order that was passed to the imputer.
imputed_columns = ['viewCount', 'commentCount', 'likeCount', 'dislikeCount', 'favoriteCount']
dataset_13c_cp.loc[:, imputed_columns] = imputed_data

## Data Type Casting

In [6]:
# Convert the count columns to integers.
# The KNN imputer fills gaps with neighbour averages, which are generally
# fractional. A bare integer cast truncates toward zero, so round to the nearest
# whole number first. An explicit 64-bit dtype is used because the width of plain
# 'int' depends on the platform.
count_columns = ['viewCount', 'commentCount', 'likeCount', 'dislikeCount', 'favoriteCount']
dataset_13c_cp = (
    dataset_13c_cp
    # Go through float64 first so rounding also applies if a column is stored as object.
    .astype({column: 'float64' for column in count_columns})
    .round({column: 0 for column in count_columns})
    .astype({column: 'int64' for column in count_columns})
)

In [7]:
# Class balance: number of rows per label value.
dataset_13c_cp['label'].value_counts()

Not Clickbait    242339
Clickbait         30296
Name: label, dtype: int64

In [9]:
# Number of non-null video ids for each (label, channel) pair.
# Select the column before aggregating so only `video_id` is counted,
# rather than counting every column and discarding all but one.
dataset_13c_cp.groupby(['label', 'channel_id'])[['video_id']].count()

Unnamed: 0_level_0,video_id
label,Unnamed: 1_level_1
Clickbait,30296
Not Clickbait,242339


In [45]:
# Rebuild the thumbnail link from the watch URL: swap the watch-page prefix for
# the image-host prefix (literal match, not a regex) and append the image file name.
watch_prefix = "https://www.youtube.com/watch?v="
image_prefix = "https://i.ytimg.com/vi/"
thumbnail_urls = dataset_13c_cp.url.str.replace(watch_prefix, image_prefix, regex=False) + "/default.jpg"
dataset_13c_cp.loc[:, 'thumbnails'] = thumbnail_urls

In [48]:
# Persist the prepared frame as parquet; the DataFrame index is not written.
prepared_parquet_path = "/content/drive/MyDrive/Bangla Clickbaits/prepared_datasets/dataset_272635r_13r_raw.parquet"
dataset_13c_cp.to_parquet(prepared_parquet_path, index=False)

In [53]:
# Show the first row as a list containing one {column: value} dict.
dataset_13c_cp.iloc[:1].to_dict(orient='records')

[{'channel_id': 'UCw4gfo5oaGPkHwarenuewAg',
  'video_id': 'J9xErXLh3bo',
  'publishedAt': '2021-08-17T08:59:13Z',
  'title': 'এইমাত্র! মসজিদে নামাজরত অবস্থায় তিন বৃদ্ধকে পিটালো যুবলীগ নেতা! ন্যাক্কারজনক ঘটনায় তোলপাড় সারাদেশ',
  'description': 'ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার সুযোগ করে দিতে Share শেয়ার করুন।\r\n\r\n▶ Thanks all for watching this video.\r\nadvanced thanks for ☞Subscribe☜ my channel.\r\n\r\n▶ Subscribe লিংক : https://goo.gl/wqxP1R\r\n\r\n\r\nNotice: If anyone use this channel video, we will Claimed You and take action as YouTube copyright law.\r\n\r\nআমাদের চ্যানেলের কোন ভিডিও অন্য কোন চ্যানেলে পাওয়া গেলে কপিরাইট ক্লেইম দেওয়া হবে ও নিয়ম অনুসারে ব্যবস্থা নেয়া হবে ।',
  'url': 'https://www.youtube.com/watch?v=J9xErXLh3bo',
  'viewCount': 12743,
  'commentCount': 45,
  'likeCount': 536,
  'dislikeCount': 35,
  'favoriteCount': 0,
  'thumbnails': 'https://i.ytimg.com/vi/J9xErXLh3bo/default.jpg',
  'label': 'Clickbait'}]