<a href="https://colab.research.google.com/github/anuj-glitch/news_virality/blob/master/news_virality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ***Crawl news & information websites & anticipate the likelihood of their virality.***

In [89]:
#Importing Required Libraries
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import re
import nltk
# Download the 'wordnet' NLTK package (needed by the WordNet lemmatizer created below)
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; get_lemmatized_text reads it as a module-level name
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


# Function to scrape headlines from the website www.ndtv.com
def webScrapper(pageNo,url,query='',timeout=30):
  """Collect headline texts from consecutive result pages.

  Parameters
  ----------
  pageNo : int
      Number of pages to request; page indices 0 .. pageNo-1 are appended to `url`.
  url : str
      Base URL that ends where the page index has to be appended.
  query : str, optional
      Search term. When non-empty it is percent-encoded and appended as
      "&query=<term>"; when empty no query parameter is sent.
  timeout : float, optional
      Seconds to wait for each HTTP response before requests raises a
      timeout error. Without it a single stalled connection would block
      the whole scrape indefinitely.

  Returns
  -------
  list of str
      Text of every <p class="header fbld"> element, in page order.
  """
  # Local import so the function does not depend on the notebook's import cell.
  from urllib.parse import quote

  data = []
  for i in range(pageNo):
    urlFinal = url + str(i)
    if query:
      # Encode the term so spaces, '&', '#' etc. cannot corrupt the URL.
      urlFinal += "&query=" + quote(query, safe='')
    r = requests.get(urlFinal, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html5lib')
    # find_all is the current bs4 name for the deprecated findAll alias.
    for row in soup.find_all('p', attrs={'class': 'header fbld'}):
      data.append(row.get_text())
  return data


# Number of result pages requested for each of the two training sets
pageNo = 100
# Base endpoint; webScrapper appends the page index (and "&query=..." when a query is given)
url = "https://www.ndtv.com/page/topic-load-more?%20type=news&page="
headLinesViral  = webScrapper(pageNo,url,'viral') # Headlines returned for the query 'viral' (labelled 1 later)
headLinesLatest = webScrapper(pageNo,url)         # Headlines fetched without a query (labelled 0 later)
headLinesTest   = webScrapper(25,url,'india')     # 25 pages for the query 'india'; scored by the trained model


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [90]:
# Drop repeated headlines from the non-viral ("latest") set.
# dict.fromkeys keeps the first occurrence of each headline in its original
# order and runs in linear time, unlike a repeated `x not in list` scan.
# NOTE(review): this only de-duplicates headLinesLatest itself; headlines that
# also appear in headLinesViral or headLinesTest are NOT removed - confirm
# whether that overlap should be filtered as well.
headLinesLatest = list(dict.fromkeys(headLinesLatest))

print(len(headLinesViral),len(headLinesLatest),len(headLinesTest))
# Labels: 1 for every viral headline followed by 0 for every latest headline,
# matching the order in which the two lists are concatenated into newsData.
Y = np.asarray(len(headLinesViral)*[1] + len(headLinesLatest)*[0])

1015 998 375


In [0]:

# Training frame: viral headlines first, then the latest ones (same order as the labels in Y)
newsData = pd.DataFrame({'News_Data': headLinesViral + headLinesLatest})
# Frame holding the headlines that will be scored by the trained model
newsData_test = pd.DataFrame({'News_Data': headLinesTest})

In [0]:
#Function to remove non-alphanumericals
def remove_special_characters(data):
  """Strip every character that is not a letter, digit or whitespace.

  Parameters
  ----------
  data : iterable of str
      Texts to clean (a list or a pandas Series both work).

  Returns
  -------
  list of str
      One cleaned string per input text, in the same order. Removed
      characters are deleted, not replaced by a space.
  """
  # The upper-case range must be A-Z: the range A-z also spans the ASCII
  # characters [ \ ] ^ _ ` and would leave them in the text.
  pattern = re.compile(r'[^a-zA-Z0-9\s]')
  return [pattern.sub('', text) for text in data]

In [0]:
# Lemmatizing
def get_lemmatized_text(corpus):
    """Lemmatize each whitespace-separated word of every text as a verb.

    Uses the module-level `lemmatizer`. Returns a list with one string per
    input text, its lemmatized words re-joined by single spaces.
    """
    lemmatized_texts = []
    for text in corpus:
        lemmas = []
        for token in text.split():
            lemmas.append(lemmatizer.lemmatize(token, pos="v"))
        lemmatized_texts.append(' '.join(lemmas))
    return lemmatized_texts

In [95]:
def process_data(data):
  """Clean raw headlines: strip special characters, then lemmatize.

  Returns the list produced by get_lemmatized_text for the cleaned texts.
  """
  return get_lemmatized_text(remove_special_characters(data))

# Getting processed Data
# Train and test headlines go through the same process_data pipeline;
# the 'News_Data' column of each frame is overwritten with the cleaned text.
newsData['News_Data'] = process_data(newsData['News_Data'])
newsData_test['News_Data']= process_data(newsData_test['News_Data'])

# Preview the first cleaned training rows
newsData.head()

Unnamed: 0,News_Data
0,What Does The Perfect Cup Of Tea Look Like Red...
1,Internets Dil To Pagal Hai For This Throwback ...
2,US Official Resigns After Drinking Beer Throwi...
3,Viral A Petition To UninstallWhatsApp For Amit...
4,Viral Video Of Dolphins In Meerut Stuns Intern...


In [96]:

# Function returning the TF-IDF (Term Frequency — Inverse Document Frequency) matrices for the given data

def TF_idf(review,review_test):
  """Vectorize two text collections with one TF-IDF vectorizer.

  The vectorizer is fitted on `review` only and then applied to both
  `review` and `review_test`.

  Returns a tuple (X, X_test) of the two transformed matrices.
  """
  fitted_vectorizer = TfidfVectorizer().fit(review)
  train_matrix = fitted_vectorizer.transform(review)
  test_matrix = fitted_vectorizer.transform(review_test)
  return train_matrix, test_matrix

#splitting the datapoints into train and validation set

# NOTE(review): the TF-IDF vectorizer is fitted on all labelled headlines before
# the split, so validation rows influence the vocabulary/IDF weights - confirm
# whether that is acceptable for the reported scores.
X,X_test = TF_idf(newsData['News_Data'],newsData_test['News_Data'])
# Fixed random_state so the split, and therefore the scores printed below,
# are the same on every Restart & Run All.
X_train,X_val,y_train,y_val = train_test_split(X, Y, train_size=0.7, random_state=42)

#Logistic regression model to train the data

lr_model = LogisticRegression(C=1)
lr_model.fit(X_train,y_train)
lr_predict = lr_model.predict(X_val)

#Validating the model's accuracy of Logistic Regression

# scikit-learn metrics take (y_true, y_pred) in that order.
print('Validation F1-score : ' + str(f1_score(y_val,lr_predict)))
print('Validation Accuracy : ' + str(accuracy_score(y_val,lr_predict)))

Validation F1-score : 0.8517241379310344
Validation Accuracy : 0.8576158940397351


In [97]:
# Refit the classifier on every labelled headline before scoring the test set
lr_model.fit(X,Y)
# Column 1 of predict_proba is the probability of label 1 (viral)
y_test_predict = lr_model.predict_proba(X_test)[:,1]
# Express each probability as a percentage rounded to two decimals
virality_pct = [round(prob*100,2) for prob in y_test_predict]

y_predict_df = pd.DataFrame({'News':headLinesTest,'Virality chances': virality_pct })
y_predict_df.to_csv('Virality_Predictions.csv')
y_predict_df.head()

Unnamed: 0,News,Virality chances
0,"RedmiBook, Mi-Branded Laptops Said to Launch i...",21.4
1,Realme Watch Tipped to Come With 1.4-Inch Disp...,34.87
2,Children In South Asia Could Face Health Crisi...,15.83
3,"13,448 Industrial Units Get Permission To Rest...",25.32
4,"Uddhav Thackeray, Unelected, Has A Month To Qu...",28.61
