-
Notifications
You must be signed in to change notification settings - Fork 1
/
newsclassifier.py
147 lines (119 loc) · 4.74 KB
/
newsclassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import re

import nltk
import pandas as pd
import sklearn
from newsapi import NewsApiClient  # was misspelled "NewApiClient", which newsapi does not export
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Fetch the NLTK data used later for tokenising and stop-word filtering.
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt', so request both to keep word_tokenize working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Create the NewsAPI client. A key supplied through the NEWSAPI_KEY environment
# variable takes precedence so a real key does not have to be committed; the
# placeholder below is used otherwise (get your own key from NewsAPI).
newsapi = NewsApiClient(
    api_key=os.environ.get('NEWSAPI_KEY', '2a7dd9f4dd8fxxxxxxxxxxxxxxxxx')
)
# Number of articles requested per query.
PAGE_SIZE = 100

# Use the client to get tech articles and look at the raw response first.
tech_articles = newsapi.get_everything(q='tech', language='en', page_size=PAGE_SIZE)
print(tech_articles)
# Check the keys in the output
print(tech_articles.keys())
# dict_keys(['status', 'totalResults', 'articles'])
# Transform the list of articles to a pandas DataFrame
tech = pd.DataFrame(tech_articles['articles'])
print(tech)
# Add the category label to the DataFrame
tech['category'] = 'Tech'
print(tech)

# Remaining categories as (search query, category label) pairs. Each one is
# fetched, converted and labelled exactly like the tech articles above.
OTHER_CATEGORIES = [
    ('entertainment', 'Entertainment'),
    ('business', 'Business'),
    ('sports', 'Sports'),
    ('politics', 'Politics'),
    ('travel', 'Travel'),
    ('food', 'Food'),
    ('health', 'Health'),
]

categories = [tech]
for query, label in OTHER_CATEGORIES:
    response = newsapi.get_everything(q=query, language='en', page_size=PAGE_SIZE)
    frame = pd.DataFrame(response['articles'])
    frame['category'] = label
    categories.append(frame)

# Merge everything into one DataFrame. ignore_index gives every row a unique
# label instead of repeating each per-category frame's own 0..n-1 index.
df = pd.concat(categories, ignore_index=True)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()
# Define the function to clean the news title column

# English stop words, built once here instead of on every call.
_STOP_WORDS = frozenset(stopwords.words('english'))
# Characters that are deleted outright: commas, full stops, straight quotes.
_DROPPED_CHARS = re.compile(r"[,.'\"]")
# Any remaining non-word character is turned into a space.
_NON_WORD = re.compile(r'\W')


def cleaned_desc_column(text):
    """Return *text* with punctuation and English stop words removed.

    Commas, full stops and straight single/double quotes are deleted, every
    other non-word character is replaced by a space, and the result is
    tokenised with NLTK's ``word_tokenize``. Tokens that appear in NLTK's
    English stop-word list are dropped and the rest are joined with single
    spaces.

    Args:
        text: The headline to clean. Anything that is not a ``str`` (such as
            ``None`` or NaN for a missing title) is treated as empty.

    Returns:
        The cleaned headline as a single space-separated string; ``''`` when
        *text* is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = _DROPPED_CHARS.sub('', text)
    text = _NON_WORD.sub(' ', text)

    # The stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised headline words such as "The" are never filtered.
    kept = [tok for tok in word_tokenize(text) if tok.lower() not in _STOP_WORDS]
    return " ".join(kept)
# Apply cleaned_desc_column to the 'title' column of the DataFrame and store
# the result in a new 'news_title' column.
df['news_title'] = df['title'].apply(cleaned_desc_column)
print(df)

# Features are the cleaned titles, labels are the category names
X = df['news_title']
y = df['category']

# Split the data: 30% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=90
)
print(X_train.shape)
print(X_test.shape)

# Pipeline that builds the classifier: token counts -> TF-IDF weights ->
# logistic regression
lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Train the logistic regression model on the training set
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr.predict(X_test)

# Calculate the accuracy of the model. accuracy_score takes the true labels
# first and the predictions second.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
# Output recorded with the original script:
# Accuracy is : 0.7208333333333333
# Test the model with different article headlines
news = [
    "Biden to Sign Executive Order That Aims to Make Child Care Cheaper",
    "Google Stock Loses $57 Billion Amid Microsoft’s AI ‘Lead’—And Reports It Could Be Replaced By Bing On Some Smartphones",
    "Poland suspends food imports from Ukraine to assist its farmers",
    "Can AI Solve The Air Traffic Control Problem? Let's Find Out",
    "Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree",
    "Hillary Clinton: Trump cannot win election - but Biden will",
    "Jennifer Aniston and Adam Sandler starrer movie 'Murder Mystery 2' got released on March 24, this year",
]

# The pipeline accepts the raw headline strings directly.
predicted = lr.predict(news)

# Print one predicted category per headline, in the same order as `news`.
for category in predicted:
    print(category)

# Output recorded with the original script:
# Health
# Tech
# Food
# Tech
# Sports
# Politics
# Entertainment