In [None]:
# Stage 1: Installing and importing required libraries

In [None]:
# installing requirements
!pip install contractions
!pip install vaderSentiment
!pip install gitpython

In [None]:
# importing needed libraries

# for Web Scraping
import requests
from bs4 import BeautifulSoup

# for general data processing 
import pandas as pd
import numpy as np
import os
from git import Repo

# for data preprocessing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import contractions
import string

# for building model
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# for checking accuracy of model
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

# for saving and running saved model
import pickle


In [None]:
# Stage 2: Web Scraping Data

In [None]:
# Getting list of categories to scrap data from
# Every name assigned here is a notebook global; `urls` ends up holding one
# page URL per news category found in the site's category menu.
url = 'https://inshorts.com/en/read/'
urls = []
categories = []
news_categories = []

# Fail fast on network problems instead of hanging or parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')

# Category links have the form <a href="/en/read/<category>" ...>. Read the
# category from the href attribute instead of slicing the serialized HTML by
# fixed character offsets, which breaks on any whitespace or attribute change.
category_prefix = '/en/read/'
for menu in soup.find_all('ul', class_=["category-list"]):
  categories.extend(menu.find_all('a', href=True))
for link in categories:
  href = link['href']
  if not href.startswith(category_prefix):
    continue
  slug = href[len(category_prefix):].strip('/')
  # Skip a bare "/en/read/" link and any category that was already collected.
  if slug and slug not in news_categories:
    news_categories.append(slug)

urls.extend(url + slug + '/' for slug in news_categories)

In [None]:
# building dataset by scraping data from inshorts
def build_dataset(urls, max_articles=20):
  """Scrape headline/article pairs from category pages into a DataFrame.

  Parameters
  ----------
  urls : iterable of str
      Category page URLs; the last path segment of each URL is used as the
      category name (a trailing '/' is ignored).
  max_articles : int or None, optional
      Maximum number of news cards kept per page. Defaults to 20; None keeps
      every card found.

  Returns
  -------
  pandas.DataFrame
      Columns 'news_headline', 'news_article', 'news_category', one row per
      news card. The frame is empty (but has these columns) if nothing was
      scraped.

  Raises
  ------
  requests.HTTPError
      If a page answers with an error status.
  """
  columns = ['news_headline', 'news_article', 'news_category']
  news_data = []
  for u in urls:
    response = requests.get(u, timeout=30)
    response.raise_for_status()
    # Explicit parser: avoids results depending on installed optional parsers.
    soup = BeautifulSoup(response.content, 'html.parser')
    category = u.rstrip('/').rsplit('/', 1)[-1]
    title_cards = soup.find_all('div', class_=["news-card-title news-right-box"])
    content_cards = soup.find_all('div', class_=["news-card-content news-right-box"])
    page_articles = []
    for title_card, content_card in zip(title_cards, content_cards):
      headline = title_card.find('span', attrs={"itemprop": "headline"})
      article = content_card.find('div', attrs={"itemprop": "articleBody"})
      # Skip cards that lack either element instead of failing on None.
      if headline is None or article is None:
        continue
      # get_text() returns the text even when the tag contains nested markup,
      # where .string would be None and break the later string processing.
      page_articles.append({'news_headline': headline.get_text(),
                            'news_article': article.get_text(),
                            'news_category': category})
    news_data.extend(page_articles[:max_articles])
  # Passing the columns explicitly keeps the schema stable for an empty scrape.
  return pd.DataFrame(news_data, columns=columns)
df=build_dataset(urls)
df.tail()

In [None]:
# Stage 3 : Data Pre-Processing

In [None]:
# Fetch the NLTK stop-word corpus and take its English word list.
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
# 'no' and 'not' are taken out of the list so they survive stop-word removal
# (presumably because negations matter for sentiment).
for negation in ('no', 'not'):
  stopword_list.remove(negation)

In [None]:
# function to remove HTML tag
def html_tag(text):
  """Parse `text` as HTML and return only its text content, markup removed."""
  return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# function to remove contractions
def con(text):
  """Return `text` with its contractions expanded by contractions.fix."""
  return contractions.fix(text)

# quick demonstration on a sample sentence
con("Y'all can't expand I'd think")

In [None]:
# remove special characters
def remove_sp(text):
  """Delete every character that is not A-Z, a-z, 0-9 or whitespace.

  Removed characters are dropped, not replaced, so the whitespace that
  surrounded them is left as it was.
  """
  return re.compile(r'[^A-Za-z0-9\s]').sub('', text)

# quick demonstration on a sample sentence
remove_sp("well it is fun !! what @ do you think.")

In [None]:
# function to remove stop words
tokenizer = ToktokTokenizer()
def remove_stopwords(text):
  """Tokenize `text` and re-join the tokens that are not in `stopword_list`.

  Tokens are stripped of surrounding whitespace before the lookup; the
  comparison is case-sensitive. The kept tokens are joined by single spaces.
  """
  kept_tokens = []
  for raw_token in tokenizer.tokenize(text):
    word = raw_token.strip()
    if word not in stopword_list:
      kept_tokens.append(word)
  return ' '.join(kept_tokens)

# quick demonstration on a sample sentence
remove_stopwords("The, and , if are all stop words and even not")

In [None]:
# data pre-processing
# Each cleaning step is applied to the headline column and then the article
# column before moving on to the next step: lowercase, strip HTML, expand
# contractions, drop special characters, drop stop words.
cleaning_steps = (lambda x: x.lower(), html_tag, con, remove_sp, remove_stopwords)
text_columns = ('news_headline', 'news_article')
for step in cleaning_steps:
  for column in text_columns:
    df[column] = df[column].apply(step)

df.head()

In [None]:
# dataset labeling and processing
vs = SentimentIntensityAnalyzer()

def _compound_score(headline):
  """Return the 'compound' entry of VADER's polarity scores for one headline."""
  return vs.polarity_scores(headline)['compound']

# Score every cleaned headline and keep the result in a new 'compound' column.
df['compound'] = df['news_headline'].apply(_compound_score)
df.head()

In [None]:
# data finalization
def predict(comp):
  """Turn a compound sentiment score into a label.

  `comp` is converted with float(). A value strictly greater than zero gives
  'positive'; everything else, including exactly 0, gives 'negative'.
  """
  return 'positive' if float(comp) > 0 else 'negative'
# Label every row from its compound score and preview the result.
sentiment_labels = df['compound'].apply(predict)
df['type_pred'] = sentiment_labels
df.head()

In [None]:
# saving data to csv
# Writes the labelled frame to news.csv in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv('news.csv',index=False)

In [None]:
# taking ready dataset from Git Hub
# Rebinds `df` to the copy of the dataset published in the project repository,
# replacing whatever frame `df` referred to before this cell.
dataset_url = 'https://raw.githubusercontent.com/kvora125/Sentiment_Analysis_For_News_Headline-Major_Project/main/dataset/news.csv'
df = pd.read_csv(dataset_url)
df

In [None]:
# dropping not required data and re-indexing the remaining data
# Both unused columns go in one call (`columns=` already implies the axis).
df = df.drop(columns=['news_category', 'news_article'])
# A headline that was empty when the CSV was written is read back as NaN;
# drop such rows so the string operations in later cells do not fail on them.
df = df.dropna(subset=['news_headline'])
# Renumber the rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)
df

In [None]:
# removing punctuations
punctuations = list(string.punctuation)

def _drop_punctuation_tokens(headline):
  """Drop whitespace-separated tokens that are exactly one punctuation character."""
  return " ".join(token for token in headline.split() if token not in punctuations)

df.news_headline = df.news_headline.apply(_drop_punctuation_tokens)
df

In [None]:
# Stage 4: Model Building

In [None]:
# Select the model input and target by column position. No vectorizing happens
# here; the TF-IDF step is part of the pipeline built further down.
# NOTE(review): positions 0 and 2 are presumably news_headline and type_pred -
# confirm against the column order of the loaded CSV.
features_position, labels_position = 0, 2
x = df.iloc[:, features_position].values
y = df.iloc[:, labels_position].values

In [None]:
# splitting data for training and testing
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
# A cell displays only its last expression, so the label counts of both splits
# are gathered into a single value instead of separate statements whose results
# would be discarded. Counting unique values of the text inputs is left out:
# its result was never shown.
{split_name: dict(zip(*np.unique(split_labels, return_counts=True)))
 for split_name, split_labels in (('train', y_train), ('test', y_test))}

In [None]:
# model build and train
# Two-stage pipeline: TF-IDF vectorization of the raw text, then an SVC
# classifier (both with their default settings).
pipeline_steps = [
    ('Vect', TfidfVectorizer()),
    ('model', SVC()),
]
final = Pipeline(pipeline_steps)
final.fit(x_train, y_train)

In [None]:
# Stage 5 : Checking accuracy of the model

In [None]:
# testing the model using test data
# Predicted labels for the held-out inputs, to be compared against y_test.
y_pred = final.predict(x_test)

In [None]:
# getting accuracy score and other reports
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy itself is
# symmetric, so the value is unchanged; the order is used for consistency.
accuracy_score(y_test, y_pred)

In [None]:
# (y_true, y_pred) order: rows are the true labels, columns the predictions.
# With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

In [None]:
# (y_true, y_pred) order: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

In [None]:
# Save the Model
# The context manager closes (and thereby flushes) the file even if pickling
# fails, instead of leaving the handle open.
with open('sentiment_model.p', 'wb') as model_file:
  pickle.dump(final, model_file)

In [None]:
# running saved model from GitHub repo
import pickle
# Use one absolute path for the existence check, the clone target and the
# model file, so the check cannot disagree with the clone location when the
# working directory is not /content.
repo_dir = "/content/Sentiment-Analysis-Major-Project"
if(os.path.isdir(repo_dir)):
  print("repo exists")
else:
  Repo.clone_from("https://github.com/kvora125/Sentiment-Analysis-Major-Project", repo_dir)
# SECURITY: unpickling can execute arbitrary code - only load this file if the
# repository it was cloned from is trusted.
with open(os.path.join(repo_dir, 'sentiment_model.p'), 'rb') as model_file:
  final = pickle.load(model_file)

In [None]:
# Smoke-test the loaded model on one hand-written headline.
sample_headline = '₹100 fine for not wearing mask in Indore, 6 found infected with UK COVID variant'
sample_label = final.predict([sample_headline])[0]
print(sample_label)

In [None]:
# standalone python deployment to run on custom data and predict (you can directly run this cell)
!pip install gitpython
import os
from git import Repo
import pickle
if(os.path.isdir("Sentiment-Analysis-Major-Project")):
  print("repo exists")
else:
  Repo.clone_from("https://github.com/kvora125/Sentiment-Analysis-Major-Project", "/content/Sentiment-Analysis-Major-Project")
final=pickle.load(open('/content/Sentiment-Analysis-Major-Project/sentiment_model.p','rb'))
# print(final.predict(['₹100 fine for not wearing mask in Indore, 6 found infected with UK COVID variant'])[0])
predicted = (final.predict([str(input('enter a news headline for sentiment analysis : '))])[0])
print('Headline is Predicted to be                  :',predicted)

In [None]:
# stage 6: Deployment as webapp

In [None]:
%%writefile app.py
import streamlit as st
import os
from git import Repo
import pickle
if(os.path.isdir("Sentiment-Analysis-Major-Project")):
  print("repo exists")
else:
  Repo.clone_from("https://github.com/kvora125/Sentiment-Analysis-Major-Project", "/content/Sentiment-Analysis-Major-Project")
sentiment_model=pickle.load(open('/content/Sentiment-Analysis-Major-Project/sentiment_model.p','rb'))
st.title("News Headline Sentiment Analysis")
st.subheader('This project is based on Vader sentiment Analysis(lexicon approach) with TFIFD Vectorizer and SVM to predict sentiment in news healdline')
st.write('This project uses data Web Scraped from inshorts.com and dataset built using prediction by vader sentiment analysis')
message = st.text_area("Enter News Headline","")
if st.button("Predict"):
  st.title(sentiment_model.predict([message])[0])

In [None]:
#WebApp Deployment with ngrok server
!pip install gitpython
!pip install streamlit
!pip install pyngrok
from pyngrok import ngrok
!nohup streamlit run --server.port 80 app.py >/dev/null &
url=ngrok.connect(port='80')
url