In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/train_processed.csv')
df.head()

Unnamed: 0,file_1,file_2,real_text_id
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2


In [3]:
labels = pd.read_csv('../data/train.csv')
labels.head()

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2


In [4]:
first = df[['file_1']].join(labels)

In [5]:
second = df[['file_2']].join(labels)

In [6]:
first.head()

Unnamed: 0,file_1,id,real_text_id
0,The VIRSA (Visible Infrared Survey Telescope A...,0,1
1,China\nThe goal of this project involves achie...,1,2
2,Scientists can learn about how galaxies form a...,2,1
3,China\nThe study suggests that multiple star s...,3,2
4,Dinosaur Rex was excited about his new toy set...,4,2


In [7]:
second.head()

Unnamed: 0,file_2,id,real_text_id
0,The China relay network has released a signifi...,0,1
1,The project aims to achieve an accuracy level ...,1,2
2,Dinosaur eggshells offer clues about what dino...,2,1
3,The importance for understanding how stars evo...,3,2
4,Analyzing how fast stars rotate within a galax...,4,2


In [8]:
first['target'] = ["real" if x == 1 else "fake" for x in first['real_text_id']]

In [9]:
first.head()

Unnamed: 0,file_1,id,real_text_id,target
0,The VIRSA (Visible Infrared Survey Telescope A...,0,1,real
1,China\nThe goal of this project involves achie...,1,2,fake
2,Scientists can learn about how galaxies form a...,2,1,real
3,China\nThe study suggests that multiple star s...,3,2,fake
4,Dinosaur Rex was excited about his new toy set...,4,2,fake


In [10]:
second['target'] = ["real" if x == 2 else "fake" for x in second['real_text_id']]

In [11]:
second.head()

Unnamed: 0,file_2,id,real_text_id,target
0,The China relay network has released a signifi...,0,1,fake
1,The project aims to achieve an accuracy level ...,1,2,real
2,Dinosaur eggshells offer clues about what dino...,2,1,fake
3,The importance for understanding how stars evo...,3,2,real
4,Analyzing how fast stars rotate within a galax...,4,2,real


In [12]:
final = pd.concat([first[['file_1', 'target']].rename(columns={'file_1': 'text'}), second[['file_2', 'target']].rename(columns={'file_2': 'text'})])

In [13]:
final.head()

Unnamed: 0,text,target
0,The VIRSA (Visible Infrared Survey Telescope A...,real
1,China\nThe goal of this project involves achie...,fake
2,Scientists can learn about how galaxies form a...,real
3,China\nThe study suggests that multiple star s...,fake
4,Dinosaur Rex was excited about his new toy set...,fake


In [14]:
final.shape

(190, 2)

In [15]:
final.isna().sum()

text      2
target    0
dtype: int64

In [16]:
final.dropna(inplace=True)

In [17]:
final[final['text'].isna()]

Unnamed: 0,text,target


In [18]:
# saving this df for future use
final.to_csv('../data/traditional.csv', index=False)

### Creating ML Model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

In [20]:
vectorizer = TfidfVectorizer()

In [21]:
X = vectorizer.fit_transform(final['text'])
y = final['target']

In [22]:
X.shape, y.shape

((188, 9592), (188,))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((169, 9592), (19, 9592), (169,), (19,))

### Simple Logistic Regression:

In [25]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [26]:
y_pred_log_reg = log_reg.predict(X_test)

In [27]:
print(classification_report(y_test, y_pred_log_reg))

              precision    recall  f1-score   support

        fake       0.80      0.44      0.57         9
        real       0.64      0.90      0.75        10

    accuracy                           0.68        19
   macro avg       0.72      0.67      0.66        19
weighted avg       0.72      0.68      0.67        19



In [28]:
accuracy_score(y_test, y_pred_log_reg)

0.6842105263157895

In [29]:
roc_auc_score(y_test.map({'fake': 0, 'real': 1}), [0 if pred == 'fake' else 1 for pred in y_pred_log_reg])

0.6722222222222222

Load test data and predict

In [31]:
test_dir = "../data/test"

In [32]:
import os

In [34]:
data = []

for folder in os.listdir(test_dir):
    folder_path = os.path.join(test_dir, folder)
    
    if os.path.isdir(folder_path):
        sample_dict = {'sample_id': folder}
        for filename in os.listdir(folder_path):
            filepath = os.path.join(folder_path, filename)
            
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()
            sample_dict[filename + "_context"] = content
        data.append(sample_dict)

In [35]:
test = pd.DataFrame(data)
test.head()

Unnamed: 0,sample_id,file_1.txt_context,file_2.txt_context
0,article_0000,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,article_0001,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,article_0002,This research aimed to understand how star sha...,ChromeDriver music player\nThis study focused ...
3,article_0003,Using OmegaCAM's wide field capabilities spann...,"greek translation :\nvazhi (megaCAM), territor..."
4,article_0004,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...


In [37]:
test.isna().sum()

sample_id             0
file_1.txt_context    0
file_2.txt_context    0
dtype: int64

In [40]:
file1_vectorized = vectorizer.transform(test['file_1.txt_context'])
file2_vectorized = vectorizer.transform(test['file_2.txt_context'])

In [41]:
results = pd.DataFrame({
    'id': test['sample_id']
})

results.head()

Unnamed: 0,id
0,article_0000
1,article_0001
2,article_0002
3,article_0003
4,article_0004


In [42]:
results['file_1_results'] = log_reg.predict(file1_vectorized)
results['file_2_results'] = log_reg.predict(file2_vectorized)

In [43]:
results.head()

Unnamed: 0,id,file_1_results,file_2_results
0,article_0000,fake,real
1,article_0001,fake,real
2,article_0002,real,fake
3,article_0003,real,real
4,article_0004,fake,real


In [47]:
results['final_result'] = [
    1 if row['file_1_results'] == "real" else 2
    for _, row in results.iterrows()
]

In [48]:
results.head()

Unnamed: 0,id,file_1_results,file_2_results,final_result
0,article_0000,fake,real,2
1,article_0001,fake,real,2
2,article_0002,real,fake,1
3,article_0003,real,real,1
4,article_0004,fake,real,2


In [49]:
submission = pd.DataFrame({
    'id': range(len(results)),
    'final_result': results['final_result']
})

submission.head()

Unnamed: 0,id,final_result
0,0,2
1,1,2
2,2,1
3,3,1
4,4,2


In [51]:
submission = submission.rename(columns={'final_result': 'real_text_id'})

In [52]:
submission.to_csv('../data/submission.csv', index=False)