## **AI FOR FAKE NEWS SOURCE PROFILING**

## DATA LOADING

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("AI_news.csv")

In [3]:
df.head()

Unnamed: 0,NEWS,FAKE\REAL
0,Nov 2025 - WASHINGTON (Reuters) - The head of ...,real
1,WASHINGTON (Reuters) - Transgender people will...,real
2,2023 - WASHINGTON (Reuters) - The special coun...,real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,real
4,2024 - SEATTLE/WASHINGTON (Reuters) - Presiden...,real


In [4]:
df.shape

(100057, 2)

In [5]:
df

Unnamed: 0,NEWS,FAKE\REAL
0,Nov 2025 - WASHINGTON (Reuters) - The head of ...,real
1,WASHINGTON (Reuters) - Transgender people will...,real
2,2023 - WASHINGTON (Reuters) - The special coun...,real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,real
4,2024 - SEATTLE/WASHINGTON (Reuters) - Presiden...,real
...,...,...
100052,"CLEVELAND (Reuters) - The mayor of Youngstown,...",real
100053,ATLANTA (Reuters) - Georgia Governor Nathan De...,real
100054,WASHINGTON (Reuters) - U.S. Republican preside...,real
100055,WASHINGTON (Reuters) - Fresh from Democratic p...,real


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100057 entries, 0 to 100056
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   NEWS       100040 non-null  object
 1   FAKE\REAL  100012 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


## DATA CLEANING

In [7]:
df.isnull().sum()

NEWS         17
FAKE\REAL    45
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.duplicated().sum()

np.int64(70)

In [10]:
df=df.drop_duplicates()

In [11]:
df["FAKE\REAL"].value_counts()

  df["FAKE\REAL"].value_counts()


FAKE\REAL
real    52987
fake    46938
Name: count, dtype: int64

In [12]:
df.shape

(99925, 2)

## LABEL ENCODING

In [13]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df["FAKE\REAL"]=encoder.fit_transform(df["FAKE\REAL"])

  df["FAKE\REAL"]=encoder.fit_transform(df["FAKE\REAL"])
  df["FAKE\REAL"]=encoder.fit_transform(df["FAKE\REAL"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["FAKE\REAL"]=encoder.fit_transform(df["FAKE\REAL"])


## TEXT PREPROCCESSING

In [14]:
import re
def clean_NEWS(NEWS):
    if isinstance(NEWS,str):
        NEWS = NEWS.lower()
        NEWS = re.sub(r"http\s+","",NEWS)
        NEWS = re.sub(r"@\w+","",NEWS)
        NEWS = re.sub(r"3\w+","",NEWS)
        NEWS = re.sub(r"[^\w\s]","",NEWS)
        return NEWS
    else:
        return "missing"
    
df["clean_NEWS"]=df["NEWS"].apply(clean_NEWS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["clean_NEWS"]=df["NEWS"].apply(clean_NEWS)


In [15]:
df.drop(columns="NEWS",axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns="NEWS",axis=1,inplace=True)


In [16]:
df

Unnamed: 0,FAKE\REAL,clean_NEWS
0,1,nov 2025 washington reuters the head of a co...
1,1,washington reuters transgender people will be...
2,1,2023 washington reuters the special counsel ...
3,1,washington reuters trump campaign adviser geo...
4,1,2024 seattlewashington reuters president don...
...,...,...
100037,1,cleveland reuters the mayor of youngstown ohi...
100038,1,atlanta reuters georgia governor nathan deal ...
100039,1,washington reuters us republican presidential...
100040,1,washington reuters fresh from democratic pres...


## FEATURE EXTRACTION

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer =TfidfVectorizer(stop_words="english",max_features=5000)
x=vectorizer.fit_transform(df["clean_NEWS"])
y=df["FAKE\REAL"]

  y=df["FAKE\REAL"]


## SAVE VECTORIZER

In [18]:
import pickle 

with open('vectorizer.pkl', 'wb') as file:
        pickle.dump(vectorizer,file)

## Train-Test Split

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

## Model Training

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
grid_dict={"criterion":["entropy","gini"],
           "max_depth":[5,8,10,15],
           "min_samples_split":[2,4,5,10],
           "n_estimators":[5,10,15,20]}
rand_fost=GridSearchCV(RandomForestClassifier(random_state=42),grid_dict,cv=5,verbose=3,n_jobs=-1,scoring="accuracy")
rand_fost.fit(x_train,y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'criterion': ['entropy', 'gini'], 'max_depth': [5, 8, ...], 'min_samples_split': [2, 4, ...], 'n_estimators': [5, 10, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,10
,criterion,'entropy'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Prediction

In [21]:
y_pred=rand_fost.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,classification_report
acc=accuracy_score(y_pred,y_test)
pre=precision_score(y_pred,y_test)
rec=recall_score(y_pred,y_test)
f1=f1_score(y_pred,y_test)
report=classification_report(y_pred,y_test)
print()
print("accuracy : ",acc)
print("precision : ",pre)
print("recall : ",rec)
print("f1 : ",f1)
print()
print("report")
print(report)


accuracy :  0.9947960970728046
precision :  0.998578064271495
recall :  0.9916219523675045
f1 :  0.9950878518798413

report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9362
           1       1.00      0.99      1.00     10623

    accuracy                           0.99     19985
   macro avg       0.99      1.00      0.99     19985
weighted avg       0.99      0.99      0.99     19985



In [23]:
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [24]:
y_pred=log_reg.predict(x_test)

In [25]:
log_reg.predict_proba(x_test)[:, 1]

array([0.00495454, 0.99855968, 0.07632486, ..., 0.23874639, 0.01985979,
       0.99712726], shape=(19985,))

In [26]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,classification_report
accuracy=accuracy_score(y_pred,y_test)
precision=precision_score(y_pred,y_test)
recall=recall_score(y_pred,y_test)
f1=f1_score(y_pred,y_test)
report=classification_report(y_pred,y_test)
print()
print("accuracy - ",accuracy)
print("precision - ",precision)
print("recall - ",recall)
print("f1 - ",f1)
print()
print("report")
print(report)


accuracy -  0.9943457593194897
precision -  0.9972509242582235
recall -  0.992078460958129
f1 -  0.9946579681369073

report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9381
           1       1.00      0.99      0.99     10604

    accuracy                           0.99     19985
   macro avg       0.99      0.99      0.99     19985
weighted avg       0.99      0.99      0.99     19985



In [27]:
from sklearn.naive_bayes import MultinomialNB
naive_model=MultinomialNB()
naive_model.fit(x_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [28]:
y_pred=naive_model.predict(x_test)

In [29]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,classification_report
accuracy=accuracy_score(y_pred,y_test)
precision=precision_score(y_pred,y_test)
recall=recall_score(y_pred,y_test)
f1=f1_score(y_pred,y_test)
report=classification_report(y_pred,y_test)
print()
print("accuracy - ",accuracy)
print("precision - ",precision)
print("recall - ",recall)
print("f1 - ",f1)
print()
print("report")
print(report)


accuracy -  0.9250437828371278
precision -  0.9519385723765286
recall -  0.9101785552433608
f1 -  0.9305903067370958

report
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      8952
           1       0.95      0.91      0.93     11033

    accuracy                           0.93     19985
   macro avg       0.92      0.93      0.92     19985
weighted avg       0.93      0.93      0.93     19985



In [30]:
import pickle 

with open('naive_model.pkl', 'wb') as file:
        pickle.dump(naive_model,file)

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
grid_dict={"C":[0.2,0.5,0.8,1,2],
           "kernel":["rbf","poly"],
           "degree":[2,4,5]}

svc=SVC()
svc.fit(x_train,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [32]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [33]:
y_preds=svc.predict(x_test)

In [34]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,classification_report
acc=accuracy_score(y_preds,y_test)
pre=precision_score(y_preds,y_test)
rec=recall_score(y_preds,y_test)
f1=f1_score(y_preds,y_test)
report=classification_report(y_preds,y_test)
print()
print("accuracy : ",acc)
print("precision : ",pre)
print("recall : ",rec)
print("f1 : ",f1)
print()
print("report")
print(report)


accuracy :  0.9989992494370779
precision :  0.9990520428476632
recall :  0.9990520428476632
f1 :  0.9990520428476632

report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9436
           1       1.00      1.00      1.00     10549

    accuracy                           1.00     19985
   macro avg       1.00      1.00      1.00     19985
weighted avg       1.00      1.00      1.00     19985



In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
clas_models={"LCR":LogisticRegression(max_iter=500),
             "RFC":RandomForestClassifier(),
             "KNN":KNeighborsClassifier(n_neighbors=11),
             "SVM":SVC(kernel="rbf"),
             "GBC":GradientBoostingClassifier(random_state=0,learning_rate=0.45),
             
             "NB":MultinomialNB()}

In [None]:
results={"model":[],"accuracy":[],"precision":[],"recall":[],"f1":[]}

for model_name,model in clas_models.items():

    model.fit(x_train,y_train)

    y_pred=model.predict(x_test)

    accuracy=accuracy_score(y_pred,y_test)
    precision=precision_score(y_pred,y_test)
    recall=recall_score(y_pred,y_test)
    f1=f1_score(y_pred,y_test)

    results["model"].append(model_name)
    results["accuracy"].append(accuracy)
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1"].append(f1)

result_df=pd.DataFrame(results)
result_df


In [None]:
result_df=result_df.sort_values(by="accuracy",ascending=False)

result_df.style.background_gradient("Reds")

Unnamed: 0,model,accuracy,precision,recall,f1
1,RFC,0.99935,0.999336,0.999431,0.999384
3,SVM,0.998999,0.999052,0.999052,0.999052
4,GBC,0.998499,0.999147,0.998012,0.998579
0,LCR,0.994346,0.997251,0.992078,0.994658
5,NB,0.925044,0.951939,0.910179,0.93059
2,KNN,0.686715,0.428003,0.95213,0.590543
