# Importing Libraries

In [268]:
import os

import pandas as pd
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Ingesting Data From RDS for Model Training

In [249]:
# The connection URL is read from the NEWS_DB_URL environment variable so that
# credentials do not have to be stored in the notebook. The literal is only the
# fallback that matches the original local setup.
engine=create_engine(os.environ.get("NEWS_DB_URL",'mysql://root:root@localhost:3306/news'))

# Load the labelled articles used for model training.
dataset=pd.read_sql("select * from model_training",engine)

# The stored 'index' column is not a feature, so it is dropped.
# (A previous `dataset.reindex(dataset['index'])` call was removed: its result
# was never assigned, so it had no effect on `dataset`.)
dataset=dataset.drop(columns=['index'])

In [251]:
# Display the loaded training frame (pandas shows long frames truncated).
dataset

Unnamed: 0,title,link,summary,author,tags,Category
0,US emergency crews struggle as climate crisis ...,https://www.theguardian.com/us-news/2024/oct/0...,Resources are stretched thin as the south-east...,Gabrielle Canon,US news Extreme weather Climate crisis Hurrica...,Natural Disaster
1,Anger at UK’s ‘bonkers’ plan to reach net zero...,https://www.theguardian.com/environment/2024/o...,Government criticised over list of potential c...,Isabella Kaminski,Biomass and bioenergy North Korea UK news Afgh...,Natural Disaster
2,Hurricane Milton: what does it actually take t...,https://www.theguardian.com/us-news/2024/oct/0...,While Florida residents are being told to flee...,Adria R Walker,Hurricane Milton Extreme weather Hurricanes Fl...,Natural Disaster
3,Hurricanes like Helene twice as likely to happ...,https://www.theguardian.com/environment/2024/o...,Analysis shows Gulf’s heat that worsened Helen...,Oliver Milman and Jonathan Watts,Climate crisis Hurricanes Hurricane Milton Hur...,Natural Disaster
4,China to head green energy boom with 60% of ne...,https://www.theguardian.com/environment/2024/o...,IEA says faster clean energy rollout being led...,Jillian Ambrose,Renewable energy China Energy Energy industry ...,Natural Disaster
...,...,...,...,...,...,...
217,"Prize crossword No 29,506",https://www.theguardian.com/crosswords/prize/2...,,Boatman,Crosswords,Other
218,Observer killer sudoku,https://www.theguardian.com/lifeandstyle/2024/...,Click here to access the print version. Norma...,,Life and style,Other
219,What links Enter the Dragon and Brainstorm? Th...,https://www.theguardian.com/lifeandstyle/2024/...,"From US presidents to burrowing mammals, test ...",Thomas Eaton,Life and style Quiz and trivia games,Other
220,Did you solve it? The box problem that baffled...,https://www.theguardian.com/science/2024/sep/3...,The solution to today’s puzzle Earlier today I...,Alex Bellos,Mathematics Science,Other


# Reshuffling the Dataset, Combining Fields into an Essay, and Splitting It

In [262]:
# Shuffle the rows. The fixed seed makes the row order, and therefore the
# split and every metric computed from it, the same on each fresh run.
dataset=dataset.sample(frac=1,random_state=42)

# Combine the text fields into one "essay" string per article.
# NOTE(review): the fields are joined without a separator, and a missing value
# in any one of them makes the whole essay NaN (it is filled with a placeholder
# afterwards). predict_it() builds its input the same way, so the two must be
# changed together if this is revisited.
combined=dataset['title']+dataset['summary']+dataset['tags']+dataset['author']
total=pd.concat([combined,dataset['Category']],axis=1)

# Split into train and test early to avoid data leakage from the vectorizer
# and label encoder that are fitted afterwards.
X=total[0]  # the unnamed combined Series becomes column 0 after concat
y=total['Category']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# Preprocessing data

In [269]:
# Replace missing essays with the "AAA" placeholder in both splits.
X_train,X_test=(split.fillna("AAA") for split in (X_train,X_test))

# TF-IDF features: the vocabulary (capped at 1000 terms) is fitted on the
# training split only; the test split is transformed with that fitted state.
transformer=TfidfVectorizer(max_features=1000)
X_train_transformed=transformer.fit_transform(X_train.tolist())
X_test_transformed=transformer.transform(X_test.tolist())

# Integer-encode the category labels, fitting on the training labels only.
encoder=LabelEncoder()
y_train_transformed=encoder.fit_transform(y_train)
y_test_transformed=encoder.transform(y_test)

# Model Training

In [270]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB()
clf.fit(X_train_transformed, y_train_transformed)

# Predict both splits up front so they can be reported side by side.
y_pred_training = clf.predict(X_train_transformed)
y_pred_testing = clf.predict(X_test_transformed)

# Same report for each split: heading, accuracy, then the per-class table.
for heading, y_true, y_hat in (
    ("On Training Data", y_train_transformed, y_pred_training),
    ("On Testing Data", y_test_transformed, y_pred_testing),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat)}")
    print(classification_report(y_true, y_hat))

On Training Data
Accuracy: 0.9491525423728814
              precision    recall  f1-score   support

           0       1.00      0.44      0.61        16
           1       1.00      1.00      1.00        47
           2       0.89      1.00      0.94        66
           3       0.98      1.00      0.99        48

    accuracy                           0.95       177
   macro avg       0.97      0.86      0.89       177
weighted avg       0.95      0.95      0.94       177

On Testing Data
Accuracy: 0.9111111111111111
              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       1.00      1.00      1.00         7
           2       0.78      1.00      0.88        14
           3       1.00      0.95      0.97        20

    accuracy                           0.91        45
   macro avg       0.94      0.80      0.81        45
weighted avg       0.93      0.91      0.90        45



# Making Predictions

In [306]:
def predict_it(data,estimator):
    """Predict encoded category labels for a frame of articles.

    The 'title', 'summary', 'tags' and 'author' columns of ``data`` are
    concatenated in that order, missing results are replaced with "AAA", and
    the text is vectorised with the module-level ``transformer``.

    Returns whatever ``estimator.predict`` yields for the vectorised rows.
    """
    essays=(data['title']+data['summary']+data['tags']+data['author']).fillna("AAA")
    features=transformer.transform(essays.tolist())
    return estimator.predict(features)

# Lookup table from encoded label (its position in encoder.classes_) to the
# class name. It has its own name so that the training labels held in `y` are
# not overwritten by this cell.
label_by_code=dict(enumerate(encoder.classes_))

def map_it(x):
    """Return the class name for encoded label ``x``, or None if ``x`` is not a known code."""
    return label_by_code.get(x)

def interpret_prediction(prediction):
    """Translate every encoded label in ``prediction`` to its class name.

    Each element is passed through ``map_it``; the results are returned as a
    list in the same order.
    """
    return [map_it(code) for code in prediction]

# NOTE(review): `daily_article` is only assigned in a later cell (the one that
# reads the daily_article table), so that cell has to be run before this one.
# .loc slicing is label-based and includes both ends (labels 100 through 109).
to_predict_for=daily_article.loc[100:109]
# Predict encoded labels with the Naive Bayes model `clf`, then map them back
# to class names.
pred=interpret_prediction(predict_it(to_predict_for,clf))
pred

['Terrorism Protest Political Unrest Riot',
 'Positive Uplifting',
 'Other',
 'Other',
 'Other',
 'Terrorism Protest Political Unrest Riot',
 'Other',
 'Terrorism Protest Political Unrest Riot',
 'Positive Uplifting',
 'Other']

# Training with other models

In [108]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the same TF-IDF features, default hyper-parameters.
lr=LogisticRegression().fit(X_train_transformed,y_train_transformed)
# Training-split predictions; they are scored in the following cell.
y_pred=lr.predict(X_train_transformed)

In [109]:
# Accuracy on the training split. `y_pred` is expected to still hold the
# training-split predictions from the previous cell.
# accuracy_score takes the ground truth first and the predictions second.
accuracy_score(y_train_transformed,y_pred)

0.9887005649717514

In [110]:
# Accuracy of the logistic regression model on the held-out test split
# (ground truth first, predictions second).
y_pred=lr.predict(X_test_transformed)
accuracy_score(y_test_transformed,y_pred)

0.9555555555555556

In [111]:
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal randomness so the fitted tree and the
# scores below are repeatable across runs.
dt=DecisionTreeClassifier(random_state=42)
dt.fit(X_train_transformed,y_train_transformed)

# Accuracy on the training split (ground truth first, predictions second).
y_pred=dt.predict(X_train_transformed)
accuracy_score(y_train_transformed,y_pred)

1.0

In [112]:
# Accuracy of the decision tree on the held-out test split
# (ground truth first, predictions second).
y_pred=dt.predict(X_test_transformed)
accuracy_score(y_test_transformed,y_pred)

0.8888888888888888

# Importing Real-World Data

In [131]:
# Load the unlabelled daily articles that the trained model is applied to.
daily_article=pd.read_sql("select * from daily_article",engine)

# Use the stored 'index' column as the row labels. set_index relabels the rows
# as they are and removes the column. The previous reindex(index=...) instead
# looked those values up in the fresh 0..n-1 index, which reorders rows or
# yields all-NaN rows whenever the stored values do not line up with it.
daily_article=daily_article.set_index('index')

In [317]:
# Scratch check: extract the minutes field from a "date HH:MM:SS.ffffff" string.
data="2020-01-15 09:03:32.744178"
time=data.split()[1]  # the clock part after the space

# Take the two characters that follow the first ':' of each clock string.
# The lambda now works on its own argument instead of the outer `time`, so it
# stays correct if more than one value is mapped.
minutes=list(map(lambda x:x[x.find(":")+1:x.find(":")+3],[time]))
minutes

['03']