# DESCRIPTION

This is an exploratory analysis of fake news articles collated by BuzzFeed News and the amount of Facebook engagement they garnered.

The raw data was extracted from BuzzFeed News's GitHub repository: https://github.com/BuzzFeedNews/2018-12-fake-news-top-50/tree/master
Please check out their GitHub profile for more context on the data and the data extraction method.

In [None]:
## importing of modules

import pandas as pd
import numpy as np
import matplotlib as plotly
import datetime as dt

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime);
# the dataset is read from that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Cleaning

In [None]:
## Loading csv into dataframe

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Fake News/fakenews_article.csv")

In [None]:
## Identifying null values
## info() lists the non-null count and dtype of every column, which shows where values are missing.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13961 entries, 0 to 13960
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           13960 non-null  object
 1   url             13961 non-null  object
 2   site_name       13961 non-null  object
 3   fb_engagement   13961 non-null  object
 4   published_date  13961 non-null  object
 5   category        11854 non-null  object
dtypes: object(6)
memory usage: 654.5+ KB


In [None]:
## basic information about the dataframe
## every column is still of object dtype here, so describe() reports count / unique / top / freq.

df.describe()

Unnamed: 0,title,url,site_name,fb_engagement,published_date,category
count,13960,13961,13961,13961.0,13961,11854
unique,12372,12971,74,2377.0,364,165
top,Neon Nettle,https://newspunch.com/hundreds-children-rescue...,yournewswire.com,0.0,7/20/2018,Sean Adl-tabatabai
freq,4,2,3555,3087.0,537,2835


In [None]:
## Correcting the type for column (fb_engagement)

# The raw values are text that appears to look like "1,234.00". Strip the
# thousands separators as literal text (regex=False). The previous pattern
# ".00" was interpreted as a regular expression on pandas < 2.0, where "."
# matches ANY character, so a value such as "1,000.00" was silently mangled
# into "10".
engagement_text = df["fb_engagement"].str.replace(",", "", regex=False)

# Blank entries count as zero engagement.
engagement_text = engagement_text.replace("", "0")

# Parsing as a number also takes care of a trailing ".00"; store as integers.
df["fb_engagement"] = pd.to_numeric(engagement_text).astype(int)

  df["fb_engagement"] = df["fb_engagement"].str.replace(".00",'')


In [None]:
## We'll keep only the articles with more than 1000 engagements. 1000 is a safe cut-off; otherwise the data would be skewed towards 0, as there are a lot of low-engagement articles.

df = df[df["fb_engagement"].gt(1000)]

In [None]:
## Drop every article categorised as 'Politics': the topic is so broad that it would blur any category-level patterns.
## Rows with a missing category are kept, because NaN compares as "not equal" to "Politics".

df = df[df["category"].ne("Politics")]

In [None]:
## Identifying outliers

# Tukey's fences: values more than 1.5 * IQR below the first quartile or above
# the third quartile of fb_engagement are treated as outliers.
engagement_summary = df["fb_engagement"].describe()
q1 = engagement_summary['25%']
q3 = engagement_summary['75%']

iqr = q3 - q1
fence_margin = 1.5 * iqr

lf = q1 - fence_margin
uf = q3 + fence_margin

print(f'25th Quartile: {q1}')
print(f'75th Quartile: {q3}')
print(f'Interquartile range: {iqr}')
print(f'Lower Fence: {lf}')
print(f'Upper Fence: {uf}')

print('The quartiles are low due to the number of articles analyzed. There are articles that are not necessary in our data analysis as of the moment.')

25th Quartile: 1750.25
75th Quartile: 10252.0
Interquartile range: 8501.75
Lower Fence: -11002.375
Upper Fence: 23004.625
The quartiles are low due to the number of articles analyzed. There are articles that are not necessary in our data analysis as of the moment.


In [None]:
## We'll set aside all the articles within the lower and upper fence to another dataframe

# .copy() makes main_df an independent DataFrame rather than a slice of df, so
# the columns added to it afterwards are written to main_df itself and pandas
# does not raise SettingWithCopyWarning on each assignment.
within_fences = (df["fb_engagement"] > lf) & (df["fb_engagement"] < uf)
main_df = df[within_fences].copy()

In [None]:
## Changing the published date to datetime data type for further analysis

date_format = '%m/%d/%Y'
main_df["published_date"] = pd.to_datetime(main_df["published_date"], format=date_format)

## Creating another column called "elapsed_days": the number of whole days between the publishing date and December 28, 2023, the date used here as the dataset release date.
## NOTE(review): the source repository is named "2018-12-fake-news-top-50", so the release may actually have been in December 2018 - confirm before relying on elapsed_days.

release_date = pd.to_datetime("12/28/2023", format=date_format)

main_df["elapsed_days"] = (release_date - main_df["published_date"]).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["published_date"] = pd.to_datetime(main_df["published_date"],format='%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["elapsed_days"] = (release_date - main_df["published_date"]).dt.days


In [None]:
## engagement_per_day: total FB engagements divided by the number of days elapsed since publication.

main_df["engagement_per_day"] = main_df["fb_engagement"].div(main_df["elapsed_days"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["engagement_per_day"] = main_df["fb_engagement"]/main_df["elapsed_days"]


In [None]:
## add another column for the month name when the article was published.
## '%B' formats each date as its full month name (e.g. "June").

main_df["month_published"] = main_df["published_date"].dt.strftime('%B')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["month_published"] = main_df["published_date"].dt.strftime('%B')


In [None]:
## engagement_per_month: FB engagements received per 30-day period since the article was published.

days_per_month = 30
elapsed_months = main_df["elapsed_days"] / days_per_month
main_df["engagement_per_month"] = main_df["fb_engagement"] / elapsed_months

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["engagement_per_month"] = main_df["fb_engagement"]/(main_df["elapsed_days"]/30)


In [None]:
## add another column for the length of the title.

# Series.str.len() yields NaN for a missing title instead of raising the
# TypeError that apply(len) hits on NaN; the raw data has one row without a title.
main_df["title_length"] = main_df["title"].str.len().astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["title_length"] = main_df["title"].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["title_length"] = main_df["title_length"].astype(float)


In [None]:
## The "category" column has a lot of null values. We replace them with the label "Uncategorized" for uniformity.

missing_category_label = "Uncategorized"
main_df["category"] = main_df["category"].fillna(value=missing_category_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["category"] = main_df["category"].fillna("Uncategorized")


In [None]:
## Order the articles from the most to the least FB engagement and show the result.
main_df = main_df.sort_values("fb_engagement", ascending=False)
display(main_df)

Unnamed: 0,title,url,site_name,fb_engagement,published_date,category,elapsed_days,engagement_per_day,month_published,engagement_per_month,title_length
260,Anthony Bourdain Was About To Expose Elite Ped...,https://yournewswire.com/anthony-bourdain-elit...,yournewswire.com,22975,2018-06-12,Uncategorized,2025,11.345679,June,340.370370,66.0
261,Anthony Bourdain's Mother: My Son Would Not Co...,https://yournewswire.com/anthony-bourdains-mot...,yournewswire.com,22934,2018-06-12,Uncategorized,2025,11.325432,June,339.762963,58.0
262,"Lisa Page: Trump Is Right, Mueller Probe Is Wi...",https://yournewswire.com/lisa-page-mueller-pro...,yournewswire.com,22792,2018-07-21,Uncategorized,1986,11.476334,July,344.290030,54.0
263,Untersuchungsausschuss 2018. MERKEL IM MAI VOR...,http://www.24aktuelles.com/5ac133713d3d7/unter...,www.24aktuelles.com,22780,2018-04-02,Uncategorized,2096,10.868321,April,326.049618,54.0
264,Keanu Reeves: Humans About To Break Free From ...,https://newspunch.com/keanu-reeves-breaking-fr...,newspunch.com,22778,2018-02-02,Uncategorized,2155,10.569838,February,317.095128,58.0
...,...,...,...,...,...,...,...,...,...,...,...
1736,Oxford University: Satanism 'Fastest Growing R...,https://yournewswire.com/satanism-fastest-grow...,yournewswire.com,1020,2018-01-15,Baxter Dmitry,2173,0.469397,January,14.081914,65.0
1737,This car almost failed its MOT because it had ...,https://uokhun.uk/2018/11/30/this-car-almost-f...,uokhun.uk,1020,2018-11-30,Uncategorized,1854,0.550162,November,16.504854,80.0
1738,Facebook confirms our page owner has been bann...,https://uokhun.uk/2018/11/19/facebook-confirms...,uokhun.uk,1019,2018-11-19,Uncategorized,1865,0.546381,November,16.391421,64.0
1739,China Killed or Imprisoned 18-20 CIA Spies Aft...,http://www.truthandaction.org/china-killed-or-...,www.truthandaction.org,1010,2018-08-29,Uncategorized,1947,0.518747,August,15.562404,86.0


In [None]:
## Write the cleaned data (including the index column) as CSV to a file named "ActualData" (no extension) in the working directory.
main_df.to_csv("ActualData")

### Data Analysis

In [None]:
## Column dtypes and non-null counts of the cleaned dataframe.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 260 to 1740
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   title                 1456 non-null   object        
 1   url                   1456 non-null   object        
 2   site_name             1456 non-null   object        
 3   fb_engagement         1456 non-null   int64         
 4   published_date        1456 non-null   datetime64[ns]
 5   category              1456 non-null   object        
 6   elapsed_days          1456 non-null   int64         
 7   engagement_per_day    1456 non-null   float64       
 8   month_published       1456 non-null   object        
 9   engagement_per_month  1456 non-null   float64       
 10  title_length          1456 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(5)
memory usage: 136.5+ KB


In [None]:
## Summary statistics of the numeric columns of the cleaned dataframe.
main_df.describe()

Unnamed: 0,fb_engagement,elapsed_days,engagement_per_day,engagement_per_month,title_length
count,1456.0,1456.0,1456.0,1456.0,1456.0
mean,4851.451236,2003.763049,2.424315,72.729453,69.737637
std,4790.967099,110.650613,2.392386,71.771567,16.192438
min,1010.0,1845.0,0.469397,14.081914,21.0
25%,1607.5,1903.0,0.810326,24.309778,61.0
50%,2804.0,2002.5,1.411536,42.346069,69.0
75%,6261.75,2098.25,3.155468,94.664035,78.0
max,22975.0,2207.0,11.773091,353.192739,235.0


In [None]:
## This is a brief ranking of average FB engagement per category or topic.

# Select the column before aggregating: calling .agg("mean") on the whole
# grouped frame also tries to average the text columns, which is deprecated in
# pandas 1.x and raises TypeError in pandas 2.x.
main_df.groupby("category")["fb_engagement"].mean().to_frame().sort_values("fb_engagement", ascending=False)

  main_df.groupby(["category"]).agg("mean")["fb_engagement"].to_frame().sort_values("fb_engagement",ascending=False)


Unnamed: 0_level_0,fb_engagement
category,Unnamed: 1_level_1
De Niro,7459.000000
Uncategorized,7258.314925
Joan Dennison,4836.000000
Breakingnews365.net,4736.500000
Mary Washington,4695.000000
...,...
Daniel Boffey,1216.000000
Robert Costa,1125.000000
Riva Mendoza,1097.000000
Busta Troll,1032.000000


In [None]:
## A ranking of average engagement per day for each category or topic.

# Select the column before aggregating: calling .agg("mean") on the whole
# grouped frame also tries to average the text columns, which is deprecated in
# pandas 1.x and raises TypeError in pandas 2.x.
main_df.groupby("category")["engagement_per_day"].mean().to_frame().sort_values("engagement_per_day", ascending=False)

  main_df.groupby(["category"]).agg("mean")["engagement_per_day"].to_frame().sort_values("engagement_per_day",ascending=False)


Unnamed: 0_level_0,engagement_per_day
category,Unnamed: 1_level_1
De Niro,3.649217
Uncategorized,3.628213
Joan Dennison,2.462322
Breakingnews365.net,2.431646
Scott Morefield,2.329067
...,...
Janet Farrow,0.623006
Robert Costa,0.604514
Riva Mendoza,0.577672
Obama Spy,0.498780


In [None]:
## A ranking of average engagement per month for each category or topic.

# Select the column before aggregating: calling .agg("mean") on the whole
# grouped frame also tries to average the text columns, which is deprecated in
# pandas 1.x and raises TypeError in pandas 2.x.
main_df.groupby("category")["engagement_per_month"].mean().to_frame().sort_values("engagement_per_month", ascending=False)

  main_df.groupby(["category"]).agg("mean")["engagement_per_month"].to_frame().sort_values("engagement_per_month",ascending=False)


Unnamed: 0_level_0,engagement_per_month
category,Unnamed: 1_level_1
De Niro,109.476517
Uncategorized,108.846379
Joan Dennison,73.869654
Breakingnews365.net,72.949376
Scott Morefield,69.872022
...,...
Janet Farrow,18.690188
Robert Costa,18.135411
Riva Mendoza,17.330174
Obama Spy,14.963397


In [None]:
## the total number of article titles referencing "Trump" in some way.

# na=False treats a missing title as "no match", so the mask stays strictly
# boolean; summing the mask counts the matching titles.
trump_count = main_df["title"].str.contains("trump", case=False, na=False).sum()

## the total number of article titles referencing "Clinton" in some way.
clinton_count = main_df["title"].str.contains("clinton", case=False, na=False).sum()

print(f'Number of titles mentioning Trump: {trump_count}')
print(f'Number of titles mentioning Clinton: {clinton_count}')

Number of titles mentioning Trump: 175
Number of titles mentioning Clinton: 96


In [None]:
## Correlation (Series.corr default method) between how long a title is and how much FB engagement the article received.
title_lengths = main_df["title_length"]
engagement_totals = main_df["fb_engagement"]
correlation = title_lengths.corr(engagement_totals)

print(f'The correlation coefficient between title length and FB engagement is: {correlation}')

The correlation coefficient between title length and FB engagement is: -0.0016302312460418564


In [None]:
## what are the trends for the bottom quartile of the articles (at or below the 25th percentile of FB engagement)?

q1 = main_df["fb_engagement"].quantile(0.25)

q1_df = main_df[main_df["fb_engagement"] <= q1]

# Select the column before averaging; .agg("mean") on the whole grouped frame
# also tries to average text columns and raises TypeError in pandas 2.x.
q1_df.groupby("category")["fb_engagement"].mean().to_frame().sort_values("fb_engagement", ascending=False)

In [None]:
## for the bottom quartile of the articles (at or below the 25th percentile of FB engagement), which categories are the fastest to gain engagements?

q1 = main_df["fb_engagement"].quantile(0.25)

q1_df = main_df[main_df["fb_engagement"] <= q1]

# Select the column before averaging; .agg("mean") on the whole grouped frame
# also tries to average text columns and raises TypeError in pandas 2.x.
q1_df.groupby("category")["engagement_per_day"].mean().to_frame().sort_values("engagement_per_day", ascending=False)

In [None]:
## what are the trends for the top quartile of the articles (at or above the 75th percentile of FB engagement)?

q3 = main_df["fb_engagement"].quantile(0.75)

q3_df = main_df[main_df["fb_engagement"] >= q3]

# Select the column before averaging; .agg("mean") on the whole grouped frame
# also tries to average text columns and raises TypeError in pandas 2.x.
q3_df.groupby("category")["fb_engagement"].mean().to_frame().sort_values("fb_engagement", ascending=False)

In [None]:
## for the top quartile of the articles (at or above the 75th percentile of FB engagement), which categories are the fastest to gain engagements?

q3 = main_df["fb_engagement"].quantile(0.75)

q3_df = main_df[main_df["fb_engagement"] >= q3]

# Select the column before averaging; .agg("mean") on the whole grouped frame
# also tries to average text columns and raises TypeError in pandas 2.x.
q3_df.groupby("category")["engagement_per_day"].mean().to_frame().sort_values("engagement_per_day", ascending=False)

In [None]:
## comparing the statistics for articles containing the word "clinton" or "trump"
## data for Trump
# na=False treats a missing title as "no match", keeping the mask strictly boolean.
trump_df = main_df[main_df["title"].str.contains("trump", case=False, na=False)]
# numeric_only=True averages just the numeric columns; agg("mean") on the mixed
# frame is deprecated in pandas 1.x and raises TypeError on the text columns in 2.x.
trump_df.mean(numeric_only=True).to_frame()

  trump_df.agg("mean").to_frame()
  trump_df.agg("mean").to_frame()


Unnamed: 0,0
fb_engagement,4298.48
elapsed_days,2031.228571
engagement_per_day,2.12537
engagement_per_month,63.761097
title_length,69.891429


In [None]:
## data for Clinton
# na=False treats a missing title as "no match", keeping the mask strictly boolean.
clinton_df = main_df[main_df["title"].str.contains("clinton", case=False, na=False)]
# numeric_only=True averages just the numeric columns; agg("mean") on the mixed
# frame is deprecated in pandas 1.x and raises TypeError on the text columns in 2.x.
clinton_df.mean(numeric_only=True).to_frame()

  clinton_df.agg("mean").to_frame()
  clinton_df.agg("mean").to_frame()


Unnamed: 0,0
fb_engagement,5478.864583
elapsed_days,1987.916667
engagement_per_day,2.754189
engagement_per_month,82.625659
title_length,69.104167


In [None]:
## How many articles (non-null titles) each website contributed, largest first.

main_df.groupby("site_name")["title"].count().to_frame().sort_values("title", ascending=False)

In [None]:
## Average FB engagement and average engagement per day for every website, ranked by average FB engagement.

site_performance = main_df.groupby("site_name")[["fb_engagement", "engagement_per_day"]].mean()
site_performance.sort_values("fb_engagement", ascending=False)