In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import DetectorFactory, detect
from tqdm import tqdm

In [2]:
# Base directory for the scrapped data files, and the CSV inside it that this notebook loads.
SCRAPPED_DATA_PATH = "./scrapped_data/"
SCRAPPED_DATA_PATH_DF = os.path.join(
    SCRAPPED_DATA_PATH,
    "scrapped_data_final.csv",
)

In [21]:
# Load the CSV at SCRAPPED_DATA_PATH_DF into the working frame used by every later cell.
df = pd.read_csv(SCRAPPED_DATA_PATH_DF)

In [22]:
# Total number of missing cells in the frame: per-column NaN counts, summed across columns.
df.isna().sum().sum()

0

In [23]:
# Count pure_text entries whose exact Python type is float.
# NOTE(review): presumably a guard against NaN placeholders (missing text loads as float) — confirm intent.
(df.pure_text.map(type)==float).sum()

0

In [5]:
# List the column labels available in the loaded frame.
df.columns

Index(['title', 'publication', 'link', 'author', 'followers', 'reading_time',
       'n_words', 'pure_text', 'date', 'responses', 'n_code_chunks',
       'bold_text_count', 'italic_text_count', 'mean_image_width',
       'mean_image_height', 'n_images', 'n_lists', 'n_vids', 'n_links',
       'claps'],
      dtype='object')

In [6]:
# Columns that are cast to integer dtype; order matches the original listing.
int_cols = [
    "followers",
    "reading_time",
    "n_words",
    "responses",
    "n_code_chunks",
    "bold_text_count",
    "italic_text_count",
    "n_images",
    "n_lists",
    "n_vids",
    "n_links",
    "claps",
]

In [7]:
# Cast each column named in int_cols to integer dtype, one column at a time.
for col_name in int_cols:
    df[col_name] = df[col_name].astype(int)

In [8]:
# Show the frame dimensions, then a three-row preview.
# The sample is seeded so that re-running the notebook displays the same rows.
print(df.shape)
df.sample(3, random_state=0)

(9935, 20)


Unnamed: 0,title,publication,link,author,followers,reading_time,n_words,pure_text,date,responses,n_code_chunks,bold_text_count,italic_text_count,mean_image_width,mean_image_height,n_images,n_lists,n_vids,n_links,claps
2550,How to Copy Edit Yourself in 7 Steps,The Writing Cooperative,https://writingcooperative.com/how-to-copy-edi...,Robert Roy Britt,155000,6,1311,"More than 30 years ago, while editor of my col...",01/04/2019,4,0,11,15,1050.0,356.0,2,0,0,6,377
4573,The magic formula to an extraordinary design r...,UX Collective,https://uxdesign.cc/the-magic-formula-to-an-ex...,Michal Ninburg,41,9,1797,"When you’re searching for a design position, e...",01/11/2019,0,0,42,7,850.0,538.0,2,2,0,7,35
1752,A 3-Step Method for Making Quicker and Better ...,Better Marketing,https://medium.com/better-marketing/how-to-mak...,Brian Pennie,13400,5,1030,“One wrong person in your circle can destroy y...,20/05/2019,7,0,1,19,1050.0,700.0,1,1,0,11,885


In [9]:
# Column labels again, displayed right before the numeric-column list is defined.
df.columns

Index(['title', 'publication', 'link', 'author', 'followers', 'reading_time',
       'n_words', 'pure_text', 'date', 'responses', 'n_code_chunks',
       'bold_text_count', 'italic_text_count', 'mean_image_width',
       'mean_image_height', 'n_images', 'n_lists', 'n_vids', 'n_links',
       'claps'],
      dtype='object')

In [10]:
# Numeric columns: the same names as int_cols plus the two mean image dimensions.
num_cols = [
    "followers",
    "reading_time",
    "n_words",
    "responses",
    "n_code_chunks",
    "bold_text_count",
    "italic_text_count",
    "mean_image_width",
    "mean_image_height",
    "n_images",
    "n_lists",
    "n_vids",
    "n_links",
    "claps",
]

In [11]:
# NOTE(review): pairwise scatter matrix over num_cols is left disabled — presumably too slow
# for this many columns and rows; confirm, then either re-enable it or delete this cell.
# sns.pairplot(df[num_cols])

In [12]:
# Parse the day-first date strings once and derive every calendar feature from that one
# index, instead of re-parsing the whole column for each feature.
# NOTE(review): dayfirst=True is a parsing preference, not a strict format — an ambiguous or
# month-first string would not raise here; confirm that all dates are DD/MM/YYYY.
date_index = pd.DatetimeIndex(df["date"], dayfirst=True)
df["year"] = date_index.year
df["month"] = date_index.month_name()
df["dow"] = date_index.day_name()

In [89]:
# Number of rows per day-of-week name.
df["dow"].value_counts()

Monday       2321
Thursday     1790
Friday       1415
Wednesday    1181
Tuesday      1129
Saturday     1071
Sunday       1028
Name: dow, dtype: int64

In [80]:
# Number of rows per month name.
df["month"].value_counts()

May          1538
July         1101
October      1082
December      904
March         871
September     784
June          782
November      702
April         644
February      523
August        512
January       492
Name: month, dtype: int64

In [71]:
# Number of rows per calendar year.
df["year"].value_counts()

2019    6822
2020     709
2021     643
2018     582
2017     420
2016     202
2015     124
2022     119
2014      78
2013      57
2012      53
2011      43
2010      28
2008      17
2009      14
2007      11
2005       9
2000       2
2006       1
2004       1
Name: year, dtype: int64

In [14]:
# Detect the language code of every pure_text entry, in row order.
# langdetect's algorithm is randomized unless the factory seed is fixed; pinning it makes
# the detected labels reproducible across notebook re-runs.
DetectorFactory.seed = 0
languages = [detect(text) for text in tqdm(df.pure_text)]

100%|██████████| 9935/9935 [02:11<00:00, 75.78it/s] 


In [19]:
# Total number of missing cells in the frame at this point (per-column counts summed).
df.isna().sum().sum()

0

In [16]:
# Attach the detected language codes as a new column; the list is assigned positionally,
# so it must have one entry per row in the same order as df.pure_text.
df["language"] = languages

In [103]:
# Number of rows per detected language code.
df["language"].value_counts()

en       9752
ko         30
tr         25
es         22
th         20
ru         12
id         12
zh-tw      11
fr          9
it          8
pt          6
ja          5
zh-cn       5
vi          5
ca          5
no          2
nl          1
et          1
ro          1
de          1
pl          1
sv          1
Name: language, dtype: int64

In [104]:
# Column labels after the derived columns have been added.
df.columns

Index(['title', 'publication', 'link', 'author', 'followers', 'reading_time',
       'n_words', 'pure_text', 'date', 'responses', 'n_code_chunks',
       'bold_text_count', 'italic_text_count', 'mean_image_width',
       'mean_image_height', 'n_images', 'n_lists', 'n_vids', 'n_links',
       'claps', 'year', 'month', 'dow', 'language'],
      dtype='object')

In [18]:
# Month name -> season label, grouped by season.
season_dict = {
    **dict.fromkeys(("December", "January", "February"), 'winter'),
    **dict.fromkeys(("March", "April", "May"), 'spring'),
    **dict.fromkeys(("June", "July", "August"), 'summer'),
    **dict.fromkeys(("September", "October", "November"), 'fall'),
}
# Look up each row's season from its month name; a month missing from season_dict raises
# KeyError rather than silently producing NaN.
df['season'] = df['month'].map(season_dict.__getitem__)

In [112]:
# Number of rows per season label.
df['season'].value_counts()

spring    3053
fall      2568
summer    2395
winter    1919
Name: season, dtype: int64

In [105]:
# Distinct month names present in the data, in order of first appearance.
df["month"].unique()

array(['May', 'July', 'March', 'November', 'September', 'April',
       'October', 'December', 'June', 'February', 'August', 'January'],
      dtype=object)

In [20]:
# Write the extended frame next to the input file, without the index column.
# The path is built from SCRAPPED_DATA_PATH so input and output locations stay in sync
# if the data directory is ever changed.
df.to_csv(os.path.join(SCRAPPED_DATA_PATH, "scrapped_data_extended.csv"), index=False)