In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import dash
import plotly.express as px

In [None]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment before it is queried.
load_dotenv()

# Dataset location taken from the environment; None when the variable is unset.
DATASET_PATH = os.environ.get("DATASET_PATH")

In [8]:
# Read the raw dataset from the location configured through DATASET_PATH,
# falling back to the local "dataset.csv" when the variable is unset or empty
# (previously the configured path was loaded but never used).
dataset = pd.read_csv(DATASET_PATH or "dataset.csv")
# Show a few random rows as a quick look at the raw columns.
dataset.sample(5)

Unnamed: 0,publication_url,author,date,reading_time,post_url,title,preview_image_url,claps
198,https://python.plainenglish.io/,Liu Zuo Lin,2022-10-15,3 min,https://python.plainenglish.io/how-to-run-back...,How To Run Background Tasks In FastAPI (Python),https://cdn-images-1.medium.com/fit/t/1600/480...,101
55,https://medium.com/swlh,Zulie Rane,2022-10-25,6 min,https://medium.com/swlh/4-ways-ive-quietly-mad...,"4 Ways I’ve Quietly Made $1,000+",https://cdn-images-1.medium.com/fit/t/1600/480...,11.3K
134,https://levelup.gitconnected.com/,Jacob Ferus,2022-12-01,4 min,https://levelup.gitconnected.com/chatgpt-is-mi...,ChatGPT Is Mind-Blowing — Everything You Need ...,https://cdn-images-1.medium.com/fit/t/1600/480...,1K
182,https://python.plainenglish.io/,Edoardo Bianchi,2022-11-15,10 min,https://python.plainenglish.io/i-fine-tuned-gp...,I Fine-Tuned GPT-2 on 100K Scientific Papers. ...,https://cdn-images-1.medium.com/fit/t/1600/480...,674
76,https://humanparts.medium.com/,Jacqueline Dooley,2022-12-14,5 min,https://humanparts.medium.com/2022-in-one-word...,2022 in One Word: Menopause,https://cdn-images-1.medium.com/fit/t/1600/480...,2.2K


In [47]:
# preprocessing: build typed columns on a copy so the raw `dataset` stays untouched
df = dataset.copy()

# reading_time looks like "3 min": keep the leading number as an integer
df["reading_time"] = df["reading_time"].str.split().str[0].astype("int64")

# claps looks like "101" or "11.3K": parse to float and scale "K"-suffixed values by 1000.
# Going through str first keeps this working when read_csv inferred a numeric column
# (no "K" values in the file) or a value is missing; a plain `"K" in x` membership
# test raises TypeError on non-string entries.
claps_text = df["claps"].astype(str).str.strip()
in_thousands = claps_text.str.endswith("K", na=False)
claps_value = claps_text.str.rstrip("K").astype(float)
df["claps"] = claps_value.where(~in_thousands, claps_value * 1000)

# parse the date and expose its parts as separate columns via the vectorized .dt accessor
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["year"] = df["date"].dt.year

df.sample(5)

Unnamed: 0,publication_url,author,date,reading_time,post_url,title,preview_image_url,claps,month,day,year
188,https://python.plainenglish.io/,Haider Imtiaz,2022-11-10,8,https://python.plainenglish.io/10-python-scrip...,10 Python Scripts for Daily Automation,https://cdn-images-1.medium.com/fit/t/1600/480...,37.0,11,10,2022
116,https://medium.com/geekculture,Kostas Farkonas,2022-10-05,7,https://medium.com/geekculture/samsungs-new-sm...,Samsung’s new smartphone battery issue is pure...,https://cdn-images-1.medium.com/fit/t/1600/480...,237.0,10,5,2022
85,https://humanparts.medium.com/,Anton Kutselyk,2022-10-29,5,https://humanparts.medium.com/i-just-need-a-sl...,The anti-routine of a slow Saturday,https://cdn-images-1.medium.com/max/360/1*EGtu...,732.0,10,29,2022
166,https://levelup.gitconnected.com/,Imran Farooq,2022-09-27,2,https://levelup.gitconnected.com/react-best-pr...,React Best Practices for Software Design and A...,https://cdn-images-1.medium.com/fit/t/1600/480...,798.0,9,27,2022
206,https://python.plainenglish.io/,Liu Zuo Lin,2022-09-06,4,https://python.plainenglish.io/10-levels-of-wr...,10 Levels of Writing Python Functions,https://cdn-images-1.medium.com/fit/t/1600/480...,414.0,9,6,2022


In [74]:
# Drop the link columns and the derived date parts, then write what is left to disk.
columns_to_drop = ["post_url", "preview_image_url", "month", "day", "year"]
processed_df = df.drop(columns=columns_to_drop)
processed_df.to_csv("processed_dataset.csv", index=False)

In [66]:
# Claps over time for a single author, coloured by the publication of each post.
tim_denning_posts = df[df["author"] == "Tim Denning"]
px.scatter(tim_denning_posts, x="date", y="claps", color="publication_url")

## Required definitions for the analysis:

- Success: Moving average of the claps the writer got every month.
- Popularity: Average claps per article over month for the author/publication

## Questions related to the analysis:

### Questions about the authors:
- [x]  Do authors write for more than one publication?
- [x]  Which author is the most popular?
- [x]  Which author is the most successful one?
- [x]  Average reading time of the articles written by the successful writers
- [ ]  What kinds of stories do successful writers write?
- [x]  How many articles do successful writers write per month?

Improvs:

- What's the article about? (categorization)
- What is the average reading time for successful categories?

### Questions about the publications:
- [x]  Which publications are more popular?
- [x]  What are the average reading times per publication?
- [x]  How many different authors write for the publication?
- [x]  How many articles are published on average per month?
- [ ]  What are the categories of popular publications?


Improv:
- All questions can be generalized to show the change over time when plotting the graphs; time series can be used for this.
- What are the popular sharing hours for the articles? (This requires the publication timestamp, including the hour the article was published.)

In [10]:
# Publication questions come first.
# Working definitions used in this notebook:
#   popularity - average claps per article over month for the author/publication
#   success    - moving average of the claps the writer got every month

# Which publications are more popular?
# Total claps divided by the number of articles, per publication.
total_claps = df.groupby("publication_url")["claps"].sum()
total_articles = df["publication_url"].value_counts()
avg_claps_per_article = total_claps.div(total_articles)

avg_claps_per_article.sort_values(ascending=False)

https://medium.com/swlh              3443.600
https://entrepreneurshandbook.co/    1682.025
https://levelup.gitconnected.com/    1387.825
https://humanparts.medium.com/       1210.250
https://medium.com/geekculture        788.200
https://towardsdatascience.com/       694.000
https://python.plainenglish.io/       271.575
dtype: float64

In [11]:
# Number of distinct authors seen for each publication, largest first.
distinct_authors = df.groupby("publication_url")["author"].unique()
authors_per_publication = distinct_authors.apply(len)
authors_per_publication.sort_values(ascending=False)

publication_url
https://levelup.gitconnected.com/    28
https://towardsdatascience.com/      24
https://medium.com/geekculture       22
https://humanparts.medium.com/       15
https://python.plainenglish.io/      15
https://entrepreneurshandbook.co/    12
https://medium.com/swlh              10
Name: author, dtype: int64

In [12]:
# How many articles are published on average per month?
# Count the articles of every publication per calendar month (year + month, so the
# same month number in different years is not merged), then average those monthly
# counts for each publication.
# The count column is named explicitly: the name that value_counts() gives its
# result differs between pandas versions, so sorting by "month" on that result
# is not portable.
(
    df.groupby(["publication_url", "year", "month"])
    .size()
    .groupby(level="publication_url")
    .mean()
    .sort_values(ascending=False)
    .to_frame(name="avg_articles_per_month")
)

Unnamed: 0_level_0,month
publication_url,Unnamed: 1_level_1
https://entrepreneurshandbook.co/,10.0
https://levelup.gitconnected.com/,10.0
https://medium.com/geekculture,10.0
https://medium.com/swlh,10.0
https://python.plainenglish.io/,10.0
https://towardsdatascience.com/,8.75
https://humanparts.medium.com/,4.0


In [13]:
# Mean reading time of the articles in each publication, longest first.
reading_time_by_publication = df.groupby("publication_url")["reading_time"].mean()
reading_time_by_publication.sort_values(ascending=False)

publication_url
https://entrepreneurshandbook.co/    10.125000
https://towardsdatascience.com/       7.942857
https://humanparts.medium.com/        7.187500
https://levelup.gitconnected.com/     6.650000
https://medium.com/swlh               6.125000
https://medium.com/geekculture        5.800000
https://python.plainenglish.io/       5.625000
Name: reading_time, dtype: float64

In [14]:
# Do authors write for more than one publication?
# Count the distinct publication URLs attached to each author, largest first.
publications_per_author = df.groupby("author")["publication_url"].apply(
    lambda urls: len(set(urls))
)
publications_per_author.sort_values(ascending=False)

author
Ari Joury, PhD                           3
Tim Denning                              2
Youssef Hosni                            2
Ahmed Besbes                             2
Nitin Sharma                             2
                                        ..
E. Black                                 1
Dinesh Kumar K B                         1
Devansh- Machine Learning Made Simple    1
Desiree Peralta                          1
Zulie Rane                               1
Name: publication_url, Length: 117, dtype: int64

In [15]:
# How many articles does an author write per month?
# Each author's article count is divided by the number of distinct month values
# present in the dataset.
months_covered = len(df["month"].unique())
# The index and the value column are named explicitly instead of relying on the
# column names reset_index() derives from value_counts(), which changed between
# pandas versions ("index"/"author" vs. "author"/"count").
number_of_articles_authors = (
    (df["author"].value_counts() / months_covered)
    .sort_values(ascending=False)
    .rename_axis("author")
    .reset_index(name="avg. articles")
)
number_of_articles_authors

Unnamed: 0,author,avg. articles
0,Tim Denning,8.50
1,The PyCoach,4.00
2,Haider Imtiaz,2.75
3,Arthur Hayes,2.25
4,Farhan Tanvir,1.75
...,...,...
112,Mohammed Ayar,0.25
113,Sanjay Priyadarshi,0.25
114,Matt Welsh,0.25
115,Carlos Arguelles,0.25


In [16]:
# Mean reading time per author, longest first, as a two-column frame.
reading_time_by_author = df.groupby("author")["reading_time"].mean()
average_reading_time = reading_time_by_author.sort_values(ascending=False).reset_index()
average_reading_time

Unnamed: 0,author,reading_time
0,Shailey Dash,22.000000
1,Arthur Hayes,20.777778
2,Jude Ellison S. Doyle,17.000000
3,Kondah Mouad,15.000000
4,Md. Zubair,14.000000
...,...,...
112,Paul Greenberg,3.000000
113,Amal Hasni,3.000000
114,Richard Taujenis,3.000000
115,Yancy Dennis,2.333333


In [17]:
# Author popularity: mean claps over each author's articles.
mean_claps_by_author = df.groupby("author")["claps"].mean()
authors_popularity = mean_claps_by_author.reset_index()
authors_popularity

Unnamed: 0,author,claps
0,"Aaron Dinin, PhD",957.000000
1,Aayush Malik,269.000000
2,Adejumo Ridwan Suleiman,389.000000
3,Ahmed Besbes,495.000000
4,Al Anany,494.000000
...,...,...
112,Wenting Zhang,695.000000
113,"Wouter van Heeswijk, PhD",403.000000
114,Yancy Dennis,299.000000
115,Youssef Hosni,1069.666667


In [18]:
# Which publication does each author use the most?

# value_counts() orders every author's publications by article count (descending),
# so the first row of each author group is that author's most-used publication.
# This only holds because of that ordering.
a = df.groupby("author")["publication_url"].value_counts().groupby(level=0).head(1)
display(a)

# Name the count column explicitly rather than reading it back through the Series
# name assigned by value_counts(), which differs between pandas versions
# ("publication_url" vs. "count").
a = a.rename("most_published_single_publication").reset_index()
# Keep the column order the rest of the notebook was written against.
a = a[["publication_url", "author", "most_published_single_publication"]]
a

author                    publication_url                  
Aaron Dinin, PhD          https://entrepreneurshandbook.co/    1
Aayush Malik              https://towardsdatascience.com/      1
Adejumo Ridwan Suleiman   https://python.plainenglish.io/      3
Ahmed Besbes              https://levelup.gitconnected.com/    1
Al Anany                  https://entrepreneurshandbook.co/    6
                                                              ..
Wenting Zhang             https://medium.com/geekculture       1
Wouter van Heeswijk, PhD  https://towardsdatascience.com/      1
Yancy Dennis              https://python.plainenglish.io/      3
Youssef Hosni             https://medium.com/geekculture       2
Zulie Rane                https://medium.com/swlh              1
Name: publication_url, Length: 117, dtype: int64

Unnamed: 0,publication_url,author,most_published_single_publication
0,https://entrepreneurshandbook.co/,"Aaron Dinin, PhD",1
1,https://towardsdatascience.com/,Aayush Malik,1
2,https://python.plainenglish.io/,Adejumo Ridwan Suleiman,3
3,https://levelup.gitconnected.com/,Ahmed Besbes,1
4,https://entrepreneurshandbook.co/,Al Anany,6
...,...,...,...
112,https://medium.com/geekculture,Wenting Zhang,1
113,https://towardsdatascience.com/,"Wouter van Heeswijk, PhD",1
114,https://python.plainenglish.io/,Yancy Dennis,3
115,https://medium.com/geekculture,Youssef Hosni,2


In [19]:
# Is there a correlation between average claps and reading time?
# Join the per-author tables on the author name and order by average claps.
authors = (
    average_reading_time.merge(authors_popularity, how="right", on="author")
    .merge(number_of_articles_authors, how="right", on="author")
    .merge(a, how="right", on="author")
    .sort_values(by="claps", ascending=False)
)

# Earnings estimate at 0.1 per clap: a minimum approximation (author's note: it
# generally ranges between 0.01-5 usd but it's rather higher than 0.1).
authors = authors.assign(earned=authors["claps"] * 0.1)

# Drop the outliers by keeping authors with an "avg. articles" of at least 1
# (author's note: somebody got 55 -> 11k claps, which is clearly luck).
authors.loc[authors["avg. articles"] >= 1]

Unnamed: 0,author,reading_time,claps,avg. articles,publication_url,most_published_single_publication,earned
106,Tim Denning,5.941176,2953.352941,8.5,https://medium.com/swlh,24,295.335294
55,Jano le Roux,5.25,2692.5,1.0,https://medium.com/swlh,4,269.25
21,Arthur Hayes,20.777778,2422.222222,2.25,https://entrepreneurshandbook.co/,9,242.222222
98,Simon Holdorf,11.0,1212.25,1.0,https://levelup.gitconnected.com/,4,121.225
104,The PyCoach,5.5625,1093.9375,4.0,https://medium.com/geekculture,11,109.39375
64,Joseph Mavericks,9.5,732.0,1.0,https://entrepreneurshandbook.co/,4,73.2
23,Avi Chawla,6.4,712.2,1.25,https://towardsdatascience.com/,5,71.22
4,Al Anany,6.333333,494.0,1.5,https://entrepreneurshandbook.co/,6,49.4
72,Liu Zuo Lin,4.571429,292.285714,1.75,https://python.plainenglish.io/,7,29.228571
42,Farhan Tanvir,4.857143,267.571429,1.75,https://medium.com/geekculture,4,26.757143


In [39]:
# Column names of the filtered author table.
authors.loc[authors["avg. articles"] >= 1].columns

Index(['author', 'reading_time', 'claps', 'avg. articles', 'publication_url',
       'most_published_single_publication', 'earned'],
      dtype='object')

In [67]:
# export the final df about the authors
def postprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the numeric report columns rounded.

    The ``reading_time``, ``claps`` and ``earned`` columns are rounded to two
    decimals; every other column is carried over unchanged. The input frame is
    not modified. A ``KeyError`` is raised when one of the three columns is
    missing.
    """
    rounded = {
        name: df[name].round(decimals=2)
        for name in ("reading_time", "claps", "earned")
    }
    # assign() returns a new frame and replaces existing columns in place,
    # so the column order stays the same.
    return df.copy().assign(**rounded)


# Keep authors with an "avg. articles" of at least 1, round the numeric columns
# and write the table to authors.csv without the index.
frequent_authors = authors[authors["avg. articles"] >= 1]
postprocess(frequent_authors).to_csv("authors.csv", index=False)

In [21]:
# Mean claps for every distinct reading time, drawn as a bar chart.
mean_claps_by_reading_time = df.groupby("reading_time")["claps"].mean().reset_index()
px.bar(mean_claps_by_reading_time, x="reading_time", y="claps")

In [None]:
# After analyzing the two plots above, I concluded that there isn't a general correlation between the number of claps and the reading time.
# However, there could be a correlation between the two if we consider individual publications. It may also vary from author to author.