# Advanced Fake News Detection Model

## Klagøre datasættet
Indlæser og klargøre datasættet.

Først indlæser jeg hele FULL datasættet, men kun kolonnerne ``type`` og ``cleaned content``, da vi ikke må bruge meta-data.

Herefter inddeler jeg typerne under 2 klasser ``fake`` og ``reliable``, og sletter de klasser som ikke kan bruges.

In [1]:
import pandas as pd

# Reading the cleaned dataset
dataset = pd.read_csv('../cleaned_dataset_FULL.csv', usecols=['type','cleaned content'], nrows=int(995000*0.9))

# removing unwanted data
dataset = dataset.dropna(subset=['cleaned content'])
dataset = dataset[~dataset['type'].isin(['unknown', 'unreliable'])].dropna(subset=['type'])

# removing dublicate data
dataset = dataset.drop_duplicates(subset='cleaned content', keep='first')

# creating classes
dataset = dataset.replace({'conspiracy':'fake', 'junksci':'fake', 'satire':'fake', 'bias':'fake', 'state':'fake', 'hate':'fake', 'rumor':'fake'})
dataset = dataset.replace({'clickbait':'reliable', 'political':'reliable'})

dataset

Unnamed: 0,type,cleaned content
0,reliable,"['plu', 'one', 'articl', 'googl', 'plu', 'than..."
1,fake,"['cost', 'best', 'senat', 'bank', 'committe', ..."
2,fake,"['man', 'awoken', '<NUM>', 'year', 'coma', 'co..."
3,reliable,"['julia', 'geist', 'ask', 'draw', 'pictur', 'c..."
4,fake,"['<NUM>', 'compil', 'studi', 'vaccin', 'danger..."
...,...,...
895494,fake,"['poll', 'focus', 'florida', 'ohio', 'pennsylv..."
895495,reliable,"['side', 'ukrainian', 'govern', 'offici', 'sup..."
895496,reliable,"['look', 'like', 'ufc', 'fighter', 'josh', 'ko..."
895497,reliable,"['market', 'news', '|', 'tue', '<DATE>', '|', ..."


Det fås at ud fra denne klasse fordeling, er datasættet praktisk talt balanceret (48% vs 52%).

Herefter opdeller jeg datasættet i træning og validation.

In [2]:
from sklearn.model_selection import train_test_split

# Opdel i trænings- og testdatasæt
temp = train_test_split(dataset['cleaned content'], dataset['type'], test_size=0.2, random_state=50, stratify=dataset['type'])

Training_Data = pd.DataFrame({'X':temp[0], 'Y':temp[2]})
Test_Data = pd.DataFrame({'X':temp[1], 'Y':temp[3]})

Training_Data

Unnamed: 0,X,Y
31836,"['us', 'reward', 'inform', 'daesh', 'leader', ...",fake
300735,"['wrote', 'coupl', 'week', 'back', 'see', 'inc...",fake
333657,"['decis', 'far', 'reach', 'effect', 'across', ...",reliable
116099,"['via|', 'clinton', 'foundat', 'tie', 'extrem'...",fake
474742,"['last', 'week', 'professor', 'michael', 'mcco...",reliable
...,...,...
454940,"['recent', 'yellowston', 'experienc', 'number'...",fake
393285,"['follow', 'equiti', 'convert', 'debt', 'offer...",reliable
539987,"['editor', 'peopl', 'america', 'cannot', 'crea...",reliable
589483,"['us', 'share', 'end', 'oil', 'bank', 'resili'...",fake


## Input Repræsentationer

Importere ``CountVectorizer``, til at omdanne tokens til numerisk data, som kan trænes på.

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

### Count Vector

In [60]:
CountVector = CountVectorizer(
    binary=True, 
    max_features=None).fit(Training_Data['X'])

CountVector_matrix = CountVector.transform(Training_Data['X'])

In [61]:
Bigram_CountVector = CountVectorizer(
    binary=True,
    ngram_range=(2, 2), 
    max_features=None).fit(Training_Data['X'])

Bigram_CountVector_matrix = Bigram_CountVector.transform(Training_Data['X'])

In [62]:
Multigram_CountVector = CountVectorizer(
    binary=True, 
    ngram_range=(1, 2), 
    max_features=None).fit(Training_Data['X'])

Multigram_CountVector_matrix = Multigram_CountVector.transform(Training_Data['X'])

### Bag Of Words

In [None]:
BagOfWords = CountVectorizer(
    binary=False, 
    max_features=None).fit(Training_Data['X'])

BagOfWords_matrix = BagOfWords.transform(Training_Data['X'])

In [None]:
Bigram_BagOfWords = CountVectorizer(
    binary=False, 
    ngram_range=(2, 2), 
    max_features=None).fit(Training_Data['X'])

Bigram_BagOfWords_matrix = Bigram_BagOfWords.transform(Training_Data['X'])

In [None]:
Multigram_BagOfWords = CountVectorizer(
    binary=False, 
    ngram_range=(1, 2), 
    max_features=None).fit(Training_Data['X'])

Multigram_BagOfWords_matrix = Multigram_BagOfWords.transform(Training_Data['X'])

### TF-IDF transformation

In [64]:
from sklearn.feature_extraction.text import TfidfTransformer

In [65]:
TFIDF_CountVector = TfidfTransformer().fit(CountVector_matrix)

TFIDF_Bigram_CountVector = TfidfTransformer().fit(Bigram_CountVector_matrix)

TFIDF_Multigram_CountVector = TfidfTransformer().fit(Multigram_CountVector_matrix)

In [None]:
TFIDF_BagOfWords = TfidfTransformer().fit(BagOfWords_matrix)

TFIDF_Bigram_BagOfWords = TfidfTransformer().fit(Bigram_BagOfWords_matrix)

TFIDF_Multigram_BagOfWords = TfidfTransformer().fit(Multigram_BagOfWords_matrix)

### Chi feature selection

In [None]:
from sklearn.feature_selection import SelectFpr, chi2

In [97]:
Chi_CountVector = SelectFpr(chi2, alpha=0.42).fit(CountVector_matrix, Training_Data['Y'])

Chi_Bigram_CountVector = SelectFpr(chi2, alpha=0.42).fit(Bigram_CountVector_matrix, Training_Data['Y'])

Chi_Multigram_CountVector = SelectFpr(chi2, alpha=0.42).fit(Multigram_CountVector_matrix, Training_Data['Y'])

In [None]:
Chi_BagOfWords = SelectFpr(chi2, alpha=0.42).fit(BagOfWords_matrix, Training_Data['Y'])

Chi_Bigram_BagOfWords = SelectFpr(chi2, alpha=0.42).fit(Bigram_BagOfWords_matrix, Training_Data['Y'])

Chi_Multigram_BagOfWords = SelectFpr(chi2, alpha=0.42).fit(Multigram_BagOfWords_matrix, Training_Data['Y'])

In [None]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8456939836633383
Reliable : 0.8608667032364234


In [96]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.005).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8871499573756253
Reliable : 0.8965679915084326


In [95]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.01).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8877378673572796
Reliable : 0.8971195235568135


In [94]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.05).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8900496591466998
Reliable : 0.8993731538977158


In [93]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.1).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8907518446477084
Reliable : 0.8997580978228804


In [90]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.42).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8937267949274315
Reliable : 0.9024577346354704


In [92]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.5).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8932183917280608
Reliable : 0.9019382161328641


In [91]:
from sklearn.feature_selection import SelectFpr, chi2

selector = SelectFpr(chi2, alpha=0.75).fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(selector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(selector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8930016327646808
Reliable : 0.9019509570524112


## Base Model

### Count Vector

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

Base_CountVector = CountVectorizer(
    binary=True, 
    max_features=10000).fit(Training_Data['X'])

# Initializing and training the model
Base_model = LogisticRegression(max_iter=10000)
Base_model.fit(Base_CountVector.transform(Training_Data['X']), Training_Data['Y'])

# Testing the model
predictions = Base_model.predict(Base_CountVector.transform(Test_Data['X']))
result = classification_report(Test_Data['Y'], predictions, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8432551224872342
Reliable : 0.8576021374359577


### Bag Of Words

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

Base_BagOfWords = CountVectorizer(
    binary=False, 
    max_features=10000).fit(Training_Data['X'])

# Initializing and training the model
Base_model = LogisticRegression(max_iter=10000)
Base_model.fit(Base_BagOfWords.transform(Training_Data['X']), Training_Data['Y'])

# Testing the model
predictions = Base_model.predict(Base_BagOfWords.transform(Test_Data['X']))
result = classification_report(Test_Data['Y'], predictions, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8289200224944375
Reliable : 0.8471258256924163


## Logistisk Regression

Initialisere bibliotekerne brugt til den logistiske regression

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### Count Vector

#### Raw vector

In [75]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(CountVector_matrix, Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(CountVector.transform(Test_Data['X']))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8596057521408952
Reliable : 0.8724325752793153


In [76]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(Bigram_CountVector_matrix, Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(Bigram_CountVector.transform(Test_Data['X']))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8777289084366253
Reliable : 0.8867038145766027


In [77]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(Multigram_CountVector_matrix, Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(Multigram_CountVector.transform(Test_Data['X']))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8929983174063907
Reliable : 0.9021230844005214


#### TF-IDF transformation

In [69]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(TFIDF_CountVector.transform(CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(TFIDF_CountVector.transform(CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8557202670589567
Reliable : 0.8690273538389294


In [70]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(TFIDF_Bigram_CountVector.transform(Bigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(TFIDF_Bigram_CountVector.transform(Bigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8670702858746642
Reliable : 0.877480580335696


In [71]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(TFIDF_Multigram_CountVector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(TFIDF_Multigram_CountVector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8741060738439633
Reliable : 0.8859040720266287


#### Chi feature selection

In [98]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(Chi_CountVector.transform(CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(Chi_CountVector.transform(CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.859503197881
Reliable : 0.8722369909086903


In [99]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(Chi_Bigram_CountVector.transform(Bigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(Chi_Bigram_CountVector.transform(Bigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.879103833176083
Reliable : 0.8879502000889284


In [100]:
# Initializing and training the model
LogReg_model = LogisticRegression(max_iter=10000)
LogReg_model.fit(Chi_Multigram_CountVector.transform(Multigram_CountVector_matrix), Training_Data['Y'])

# Testing the model
predictions = LogReg_model.predict(Chi_Multigram_CountVector.transform(Multigram_CountVector.transform(Test_Data['X'])))
result = classification_report(Test_Data['Y'], predictions, digits=5, output_dict=True)
print(f'Fake     : {result['fake']['f1-score']}')
print(f'Reliable : {result['reliable']['f1-score']}')

Fake     : 0.8937267949274315
Reliable : 0.9024577346354704


In [52]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(CountVector_matrix, Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(CountVector.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.89      0.89      0.89     62165
    reliable       0.90      0.90      0.90     67838

    accuracy                           0.90    130003
   macro avg       0.90      0.90      0.90    130003
weighted avg       0.90      0.90      0.90    130003



In [46]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(CountVector_matrix, Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(CountVector.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.86      0.86      0.86     62165
    reliable       0.87      0.88      0.87     67838

    accuracy                           0.87    130003
   macro avg       0.87      0.87      0.87    130003
weighted avg       0.87      0.87      0.87    130003



In [None]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(BagOfWords_matrix, Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(BagOfWords.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.84      0.84      0.84     62165
    reliable       0.85      0.86      0.86     67838

    accuracy                           0.85    130003
   macro avg       0.85      0.85      0.85    130003
weighted avg       0.85      0.85      0.85    130003



In [None]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(TFIDF_CountVector.transform(CountVector_matrix), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(TFIDF_CountVector.transform(CountVector.transform(Test_Data['X'])))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.86      0.85      0.86     62165
    reliable       0.87      0.87      0.87     67838

    accuracy                           0.86    130003
   macro avg       0.86      0.86      0.86    130003
weighted avg       0.86      0.86      0.86    130003



In [None]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(TFIDF_BagOfWords.transform(BagOfWords_matrix), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(TFIDF_BagOfWords.transform(BagOfWords.transform(Test_Data['X'])))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.85      0.85      0.85     62165
    reliable       0.86      0.86      0.86     67838

    accuracy                           0.86    130003
   macro avg       0.86      0.86      0.86    130003
weighted avg       0.86      0.86      0.86    130003



In [None]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(Chi_CountVector.transform(CountVector_matrix), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(Chi_CountVector.transform(CountVector.transform(Test_Data['X'])))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.85      0.84      0.85     62165
    reliable       0.85      0.87      0.86     67838

    accuracy                           0.85    130003
   macro avg       0.85      0.85      0.85    130003
weighted avg       0.85      0.85      0.85    130003



In [None]:
# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(TFIDF_Bigram_CountVector.transform(Bigram_CountVector_matrix), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(TFIDF_Bigram_CountVector.transform(Bigram_CountVector.transform(Test_Data['X'])))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.86      0.87      0.87     62165
    reliable       0.88      0.88      0.88     67838

    accuracy                           0.87    130003
   macro avg       0.87      0.87      0.87    130003
weighted avg       0.87      0.87      0.87    130003



## Other

In [35]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Træn modellen direkte
model = GradientBoostingClassifier(n_estimators=600,random_state=50)
model.fit(CountVector.transform(Training_Data['X'][:10000]), Training_Data['Y'][:10000])

In [39]:
# Forudsig og evaluer
y_pred = model.predict(CountVector.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

              precision    recall  f1-score   support

        fake       0.80      0.83      0.81     62165
    reliable       0.84      0.81      0.82     67838

    accuracy                           0.82    130003
   macro avg       0.82      0.82      0.82    130003
weighted avg       0.82      0.82      0.82    130003



In [20]:
# Split data i træning og test
X_train, X_test, y_train, y_test = train_test_split(CountVector.transform(Training_Data['X'][:20000]), Training_Data['Y'][:20000], test_size=0.2, random_state=42)

# Træn Gradient Boosting-model
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Forudsig og evaluer på testdata
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.78      0.80      0.79      1962
    reliable       0.81      0.78      0.79      2038

    accuracy                           0.79      4000
   macro avg       0.79      0.79      0.79      4000
weighted avg       0.79      0.79      0.79      4000



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Træn en Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(CountVector.transform(Training_Data['X'][:10000]), Training_Data['Y'][:10000])

# Brug SelectFromModel til at vælge features baseret på importance
selector = SelectFromModel(gb_model, prefit=True, threshold="mean")
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

print(f"Antal udvalgte features: {X_train_selected.shape[1]}")
print(f"Antal udvalgte features: {X_test_selected.shape[1]}")

Antal udvalgte features: 283
Antal udvalgte features: 283


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Eksempeldata
X_train, X_test, y_train, y_test = train_test_split(CountVector.transform(Training_Data['X'][:10000]), Training_Data['Y'][:10000], test_size=0.2, random_state=42)

# Træn en Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Brug SelectFromModel til at vælge features baseret på importance
selector = SelectFromModel(gb_model, prefit=True, threshold="mean")
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

print(f"Antal udvalgte features: {X_train_selected.shape[1]}")
print(f"Antal udvalgte features: {X_test_selected.shape[1]}")


Antal udvalgte features: 287
Antal udvalgte features: 287


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

# Træn en Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=50)
gb_model.fit(Training_Data['X'][:1000], Training_Data['Y'][:1000])

# Brug SelectFromModel til at vælge features baseret på importance
selector = SelectFromModel(gb_model, prefit=True, threshold="mean")
X_train_selected = selector.transform(Training_Data['X'][:1000])
X_test_selected = selector.transform(Test_Data['X'][:1000])

print(f"Antal udvalgte features: {X_train_selected.shape[1]}")


ValueError: could not convert string to float: "['us', 'reward', 'inform', 'daesh', 'leader', 'ibrahim', 'ali', 'abu', 'bakr', 'al', 'baghdadi', 'increas', '<CUR>', '<NUM>', 'million', '<CUR>', '<NUM>', 'million', 'depart', 'state', 'said', 'releas', 'friday', 'washington', 'sputnik', 'releas', 'note', 'daesh', 'terrorist', 'group', 'name', 'al', 'baghdadi', 'caliph', 'declar', 'call', 'islam', 'caliph', '<DATE>', 'take', 'part', 'iraq', 'syria', 'today', 'u', 'depart', 'state', 'reward', 'justic', 'program', 'increas', '<CUR>', '<NUM>', 'million', 'reward', 'offer', 'inform', 'lead', 'locat', 'arrest', 'convict', 'islam', 'state', 'iraq', 'levant', 'isil', 'daesh', 'leader', 'abu', 'bakr', 'al', 'baghdadi', 'also', 'known', 'ibrahim', 'awwad', 'ibrahim', 'ali', 'al', 'badri', 'also', 'known', 'abu', 'du', 'releas', 'state', 'al', 'baghdadi', 'daesh', 'respons', 'thousand', 'civilian', 'death', 'middl', 'east', 'includ', 'death', 'civilian', 'hostag', 'unit', 'state', 'unit', 'kingdom', 'japan', 'releas', 'ad']"

## SVM Model

In [49]:
vectorizer = CountVectorizer(
    analyzer='word', 
    ngram_range=(1, 1), 
    binary=True, 
    max_features=10_000).fit(Training_Data['X'])

In [86]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='linear')  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X'][:5000]), 
    Training_Data['Y'][:5000])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X'][:5000]))
print(classification_report(Test_Data['Y'][:5000], y_pred))

              precision    recall  f1-score   support

        fake       0.76      0.78      0.77      2277
    reliable       0.82      0.80      0.81      2723

    accuracy                           0.79      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.79      0.79      0.79      5000



In [88]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='linear')  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X']), 
    Training_Data['Y'])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

: 

: 

In [66]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X'][:5000]), 
    Training_Data['Y'][:5000])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X'][:5000]))
print(classification_report(Test_Data['Y'][:5000], y_pred))

              precision    recall  f1-score   support

        fake       0.75      0.77      0.76      2277
    reliable       0.81      0.79      0.80      2723

    accuracy                           0.78      5000
   macro avg       0.78      0.78      0.78      5000
weighted avg       0.78      0.78      0.78      5000



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X']), 
    Training_Data['Y'])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

In [79]:
vectorizer = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1, 1), 
    binary=True, 
    max_features=10000).fit(Training_Data['X'])

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X'][:5000]), 
    Training_Data['Y'][:5000])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X'][:5000]))
print(classification_report(Test_Data['Y'][:5000], y_pred))

              precision    recall  f1-score   support

        fake       0.77      0.79      0.78      2277
    reliable       0.82      0.81      0.81      2723

    accuracy                           0.80      5000
   macro avg       0.80      0.80      0.80      5000
weighted avg       0.80      0.80      0.80      5000



In [None]:
vectorizer = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1, 1), 
    binary=True, 
    max_features=10000).fit(Training_Data['X'])

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(
    vectorizer.transform(Training_Data['X']), 
    Training_Data['Y'])

# Evaluer modellen
y_pred = svm_model.predict(vectorizer.transform(Test_Data['X']))
print(classification_report(Test_Data['Y'], y_pred))

## Logistisk Regressions Model

In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(vectorizer.transform(Training_Data['X'][:5000]), Training_Data['Y'][:5000])
feature_importances = model.coef_
print(feature_importances)

# Evaluer modellen
y_pred = model.predict(vectorizer.transform(Test_Data['X'][:10000]))
print(classification_report(Test_Data['Y'][:10000], y_pred))

[[-0.00332602  0.0097078  -0.06030206 ... -0.01712788 -0.03528295
  -0.03188948]]
              precision    recall  f1-score   support

        fake       0.75      0.78      0.77      4736
    reliable       0.80      0.77      0.78      5264

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000



In [31]:
from sklearn.linear_model import LogisticRegression

# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(vectorizer.transform(Training_Data['X'][:5000]), Training_Data['Y'][:5000])

# Evaluer modellen
y_pred = log_reg_model.predict(vectorizer.transform(Test_Data['X'][:10000]))
print(classification_report(Test_Data['Y'][:10000], y_pred))

              precision    recall  f1-score   support

        fake       0.76      0.79      0.77      4736
    reliable       0.80      0.77      0.79      5264

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000



In [71]:
from sklearn.linear_model import LogisticRegression

# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(vectorizer.transform(Training_Data['X']), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(vectorizer.transform(Test_Data['X'][:10000]))
print(classification_report(Test_Data['Y'][:10000], y_pred))

              precision    recall  f1-score   support

        fake       0.85      0.84      0.84      4728
    reliable       0.85      0.86      0.86      5272

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1, 1), 
    binary=True, 
    max_features=10000).fit(Training_Data['X'])


# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(vectorizer.transform(Training_Data['X']), Training_Data['Y'])

# Evaluer modellen
y_pred = log_reg_model.predict(vectorizer.transform(Test_Data['X'][:10000]))
print(classification_report(Test_Data['Y'][:10000], y_pred))

              precision    recall  f1-score   support

        fake       0.84      0.84      0.84      4728
    reliable       0.85      0.86      0.86      5272

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Eksempeldata
tokens = dataset['cleaned content'][:5000]
labels = dataset['type'][:5000]

# Konverter tokens til numeriske features
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), binary=True, max_features=10000)
X = vectorizer.fit_transform(tokens)

# Opdel i trænings- og testdatasæt
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=50, stratify=labels)

vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aap', ..., 'эту', 'является', 'яндекса'],
      shape=(10000,), dtype=object)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(X_train, y_train)

# Evaluer modellen
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        fake       0.78      0.90      0.84      1045
    reliable       0.87      0.73      0.79       955

    accuracy                           0.82      2000
   macro avg       0.83      0.81      0.81      2000
weighted avg       0.82      0.82      0.82      2000



In [316]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier(hidden_layer_sizes=(500,100,50,50,50,50,50,50,50,50), max_iter=100)
mlp_model.fit(X_train, y_train)

# Evaluer modelens performance
y_pred = mlp_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.79      0.85      0.82      1049
    reliable       0.82      0.74      0.78       951

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000



In [317]:
# Opret og træn SVM-modellen
svm_model = SVC(kernel='sigmoid',C=0.5)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(X_train, y_train)

# Evaluer modellen
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        fake       0.72      0.74      0.73      1049
    reliable       0.71      0.68      0.70       951

    accuracy                           0.72      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.72      0.72      0.72      2000



In [318]:
# Opret og træn SVM-modellen
svm_model = SVC(kernel='linear',C=0.01)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(X_train, y_train)

# Evaluer modellen
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.76      0.88      0.82      1049
    reliable       0.84      0.69      0.76       951

    accuracy                           0.79      2000
   macro avg       0.80      0.79      0.79      2000
weighted avg       0.80      0.79      0.79      2000



In [320]:
from sklearn.linear_model import LogisticRegression

# Opret og træn modellen
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Evaluer modellen
y_pred = log_reg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.76      0.80      0.78      1049
    reliable       0.77      0.72      0.74       951

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000



In [321]:
from sklearn.naive_bayes import MultinomialNB

# 3. Opret og træn en MultinomialNB-model
model = MultinomialNB()
model.fit(X_train, y_train)

# 4. Lav forudsigelser på testdatasættet
y_pred = model.predict(X_test)

# 5. Evaluer modelens performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.76      0.83      0.79      1049
    reliable       0.79      0.71      0.75       951

    accuracy                           0.77      2000
   macro avg       0.78      0.77      0.77      2000
weighted avg       0.78      0.77      0.77      2000



In [131]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1,0.01,0.001],
    'kernel': ['linear']
}
grid = GridSearchCV(SVC(), param_grid, refit=True)
grid.fit(X_train, y_train)
print(grid.best_params_)


{'C': 0.01, 'kernel': 'linear'}


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Eksempeldata
tokens = dataset['cleaned content'][:5000]
labels = dataset['type'][:5000]

# Konverter tokens til numeriske features
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), binary=True, max_features=10000)
X = vectorizer.fit_transform(tokens)

# Opdel i trænings- og testdatasæt
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aap', ..., 'эту', 'является', 'яндекса'],
      shape=(10000,), dtype=object)

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Opret og træn SVM-modellen
svm_model = SVC(kernel='rbf',C=2)  # 'linear' eller 'rbf' afhængigt af dine data
svm_model.fit(X_train, y_train)

# Evaluer modellen
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        fake       0.82      0.84      0.83       536
    reliable       0.81      0.79      0.80       464

    accuracy                           0.82      1000
   macro avg       0.82      0.81      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [261]:
import numpy as np

# Indlæs GloVe-filen
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]  # Ordet (token)
            vector = np.asarray(values[1:], dtype='float32')  # Vektorens værdier
            embeddings_index[word] = vector
    return embeddings_index

# Brug en sti til den hentede fil (f.eks. "glove.6B.100d.txt")
file_path = "glove.6B.300d.txt"  # Udskift med din sti
embeddings_index = load_glove_embeddings(file_path)

# Eksempel: Hent vektoren for et ord
print(embeddings_index.get("king"))  # Giver f.eks. en vektor for ordet "king"


[ 0.0033901 -0.34614    0.28144    0.48382    0.59469    0.012965
  0.53982    0.48233    0.21463   -1.0249    -0.34788   -0.79001
 -0.15084    0.61374    0.042811   0.19323    0.25462    0.32528
  0.05698    0.063253  -0.49439    0.47337   -0.16761    0.045594
  0.30451   -0.35416   -0.34583   -0.20118    0.25511    0.091111
  0.014651  -0.017541  -0.23854    0.48215   -0.9145    -0.36235
  0.34736    0.028639  -0.027065  -0.036481  -0.067391  -0.23452
 -0.13772    0.33951    0.13415   -0.1342     0.47856   -0.1842
  0.10705   -0.45834   -0.36085   -0.22595    0.32881   -0.13643
  0.23128    0.34269    0.42344    0.47057    0.479      0.074639
  0.3344     0.10714   -0.13289    0.58734    0.38616   -0.52238
 -0.22028   -0.072322   0.32269    0.44226   -0.037382   0.18324
  0.058082   0.26938    0.36202    0.13983    0.016815  -0.34426
  0.4827     0.2108     0.75618   -0.13092   -0.025741   0.43391
  0.33893   -0.16438    0.26817    0.68774    0.311     -0.2509
  0.0027749 -0.39809   

In [268]:
# Opret en matrix til sætningernes embeddings
def create_embedding_matrix(tokens, embeddings_index, embedding_dim):
    embedding_matrix = []
    for sentence in tokens:
        sentence_embedding = []
        for word in sentence:
            if word in embeddings_index:
                sentence_embedding.append(embeddings_index[word])
            else:
                sentence_embedding.append(np.zeros(embedding_dim))  # Nulvektor til ukendte ord
        embedding_matrix.append(np.mean(sentence_embedding, axis=0))  # Gennemsnit af embeddings i sætningen
    return np.array(embedding_matrix)

tokens = dataset['cleaned content'][:10000]
labels = dataset['type'][:10000]

embedding_dim = 300  # Dimensionen på den GloVe-fil, du bruger (f.eks. 100d)
X = create_embedding_matrix(tokens, embeddings_index, embedding_dim)
print(X.shape)  # Antallet af sætninger og dimensionen af embeddings


(10000, 300)


In [269]:
# Opdel i trænings- og testdatasæt
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Opret og træn en logistisk regressionsmodel
model = LogisticRegression(C=5)
model.fit(X_train, y_train)

# Evaluer modelens performance
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        fake       0.59      0.62      0.60      1049
    reliable       0.56      0.53      0.54       951

    accuracy                           0.58      2000
   macro avg       0.57      0.57      0.57      2000
weighted avg       0.58      0.58      0.58      2000



In [273]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier(hidden_layer_sizes=(512,256,128,64,32,16,8,4,2), max_iter=10000)
mlp_model.fit(X_train, y_train)

# Evaluer modelens performance
y_pred = mlp_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.63      0.48      0.55      1049
    reliable       0.55      0.68      0.61       951

    accuracy                           0.58      2000
   macro avg       0.59      0.58      0.58      2000
weighted avg       0.59      0.58      0.57      2000

