In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import spacy
import nltk
from nltk.corpus import words, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import contractions
from tqdm import tqdm
tqdm.pandas()

In [2]:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [3]:
ENGLISH_WORDS = set(w.lower() for w in words.words())
STOP_WORDS = set(w for w in stopwords.words('english'))

# Load & Inspect Data

In [4]:
DATA_PATH = 'data/fakeNewsClassificationData.csv'

In [5]:
raw_data = pd.read_csv(DATA_PATH)

In [6]:
display(raw_data.head())

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
print("Size of dataset:", raw_data.shape[0])

Size of dataset: 20800


## Sample Record

In [8]:
for c in raw_data.columns:
    print(c, ":", raw_data.loc[5][c])
    print()

title : Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart

author : Daniel Nussbaum

text : In these trying times, Jackie Mason is the Voice of Reason. [In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, and explains how President Donald Trump could win the support of the Hollywood left if the U. S. needs to strike first.  “If he decides to bomb them, the whole country will be behind him, because everybody will realize he had no choice and that was the only thing to do,” Jackie says. “Except the Hollywood left. They’ll get nauseous. ” “[Trump] could win the left over, they’ll fall in love with him in a minute. If he bombed them for a better reason,” Jackie explains. “Like if they have no transgender toilets. ” Jackie also says it’s no surprise that Hollywood celebrities didn’t support Trump’s strike on a Syrian airfield this month. “They were infuriated,” he say

## Handle Missing Data

In [9]:
print("Count of records with missing data:")
print(raw_data.isna().sum())

Count of records with missing data:
title      558
author    1957
text        39
label        0
dtype: int64


In [10]:
prepped_data = raw_data.dropna(subset=['text'])
print(prepped_data.isna().sum())

title      558
author    1918
text         0
label        0
dtype: int64


## Handle Duplicate Data

In [11]:
prepped_data.duplicated().sum()

109

In [12]:
prepped_data = prepped_data.drop_duplicates()

In [13]:
prepped_data.shape

(20652, 4)

## Check for Meta-data Leakage

In [1]:
# for idx in list(prepped_data['title'][prepped_data['title'].isna()].index):
#     print(prepped_data.loc[idx])
#     print()

In [15]:
prepped_data['label'][prepped_data['title'].isna()].unique()

array([1], dtype=int64)

* It appears that all the records with missing title happen to be fake news. So, the absence of a title could be a characterisitic of fake news.

In [14]:
c_fake = 0
c_real = 0
for i,(author,title,label) in enumerate(prepped_data[['author','title','label']].values):
    if 'anonymous' in str(title).lower():
        # print(author, "->", title, "->", label)
        if label==1:
            c_fake += 1
        else:
            c_real += 1

In [15]:
c_fake, c_real

(27, 3)

* Majority of the records in which the title contained the word 'anonymous', the news turned out to be fake. However, the number is very small compared to the total number of records.

In [16]:
c_fake = 0
c_real = 0
for i,(author,title,label) in enumerate(prepped_data[['author','title','label']].values):
    if re.match(r'\bno\b|\bnone\b|\bnil\b', str(title).lower()):
        # print(author, "->", title, "->", label)
        if label==1:
            c_fake += 1
        else:
            c_real += 1

In [17]:
c_fake, c_real

(21, 21)

In [18]:
prepped_data['label'][prepped_data['author'].isna()].value_counts()

label
1    1867
0      26
Name: count, dtype: int64

* Majority of the records with missing author name are fake, while a small number of them are real too. So it cannot be a definitive trait.

In [19]:
idxs = []
for i,author in enumerate(prepped_data['author'].values):
    if 'anonymous' in str(author).lower() or  re.match(r'\bno\b|\bnone\b|\bnil\b', str(author).lower()):
        print(author)
        idxs.append(i)

Anonymous Coward (UID 12781064)
Anonymous
Anonymous
Anonymous Coward (UID 11897093)
Anonymous
Anonymous Coward (UID 73268493)
Anonymous
Anonymous
Anonymous Coward (UID 19747754)
Anonymous
Anonymous Coward (UID 26968733)
No Author
Anonymous Coward (UID 47910020)
Anonymous Coward (UID 73270245)
Anonymous Coward (UID 73268493)
Anonymous
Anonymous
No Author
Anonymous
Anonymous Coward (UID 73270427)
No Author
Anonymous: World War 3 Is On The Horizon (In 2016) – Collective Evolution
No Author
Anonymous
Anonymous
Anonymous
No Author
Anonymous Coward (UID 58307359)
Anonymous
Anonymous Coward (UID 73270620)
World War 3 News 2016: Anonymous Declares WW3 'on the Horizon'
Anonymous
Anonymous
Anonymous Activist
Anonymous
Anonymous Coward (UID 18807137)
Anonymous Coward (UID 73271258)
Anonymous
Anonymous
Anonymous Coward (UID 73270906)
Anonymous
No Author
Anonymous
No Author
Anonymous
No Author
Anonymous
Anonymous: There Is No One Way To Live On This Planet, But We Can Be Harmonious – Collective Evo

In [20]:
print(len(idxs))

116


In [21]:
prepped_data.iloc[idxs]['label'].unique()

array([1], dtype=int64)

* Other than explicitly missing values for author column, there are records with names that contain the word 'anonymous' and 'no author', these have been found to be all fake.

In [30]:
def metadata_stats(df, col):
    # col becomes index
    stats = df.groupby(col).agg(
        total=('text', 'size'),
        fakeness=('label', 'mean'))
    # changing col back to a column and sorting groupby table
    stats = stats.reset_index().sort_values('total', ascending=False)
    return stats

In [31]:
author_stats = metadata_stats(prepped_data, 'author')

In [32]:
author_stats.head(20)

Unnamed: 0,author,total,fakeness
2944,Pam Key,243,0.004115
3929,admin,192,1.0
1762,Jerome Hudson,166,0.0
724,Charlie Spiering,141,0.0
1857,John Hayward,140,0.0
2090,Katherine Rodriguez,124,0.0
3845,Warner Todd Huston,122,0.0
1520,Ian Hanchett,119,0.0
577,Breitbart News,118,0.0
914,Daniel Nussbaum,112,0.0


In [35]:
author_leakage = author_stats[(author_stats['total']>50) & ((author_stats['fakeness']<0.05) | (author_stats['fakeness']>0.95))]
print("Authors who are mostly found for one class:")
display(author_leakage)

Authors who are mostly found for one class:


Unnamed: 0,author,total,fakeness
2944,Pam Key,243,0.004115
3929,admin,192,1.0
1762,Jerome Hudson,166,0.0
724,Charlie Spiering,141,0.0
1857,John Hayward,140,0.0
2090,Katherine Rodriguez,124,0.0
3845,Warner Todd Huston,122,0.0
1520,Ian Hanchett,119,0.0
577,Breitbart News,118,0.0
914,Daniel Nussbaum,112,0.0


* We can see that there are several authors corresponding to whom there are either 100% fake news or 100% real news. So, if the author data is fed to a classifier along with the text, the mdoel may simply learn based on the author names instead of actual text and language features. Since, clearly, there is leakage from the author data, we will not be including it for further processing.

In [44]:
title_stats = metadata_stats(prepped_data, 'title')
title_stats.head(10)

Unnamed: 0,title,total,fakeness
15400,The Dark Agenda Behind Globalism And Open Borders,5,1.0
6764,Get Ready For Civil Unrest: Survey Finds That ...,5,1.0
9609,Let’s Be Clear – A Vote For Warmonger Hillary ...,4,1.0
12658,Public vs. Media on War,4,1.0
15490,The Fix Is In: NBC Affiliate Accidentally Post...,4,1.0
10400,Michael Moore Owes Me $4.99,4,1.0
18350,What to Cook This Week - The New York Times,4,0.0
9469,Las imágenes libres de derechos más destacadas...,4,1.0
15833,The U.S. National Bird Is Now a Drone,4,1.0
18750,Will Barack Obama Delay Or Suspend The Electio...,4,1.0


In [46]:
title_stats.tail(10)

Unnamed: 0,title,total,fakeness
6634,"Gary Johnson Goes Zombie, Tries to Bite Reporters",1,1.0
6633,Gary Johnson Equates Syria Deaths Caused by As...,1,0.0
6632,Gary Cohn Relaunches War on Coal: Fuel from Am...,1,0.0
6631,"Garry Marshall, ‘Pretty Woman’ Director, Dies ...",1,0.0
6630,Garrison Keillor Turns Out the Lights on Lake ...,1,0.0
6629,Garlic: 12 Serious Health Benefits | Undergrou...,1,1.0
6628,Garlic Beats Drug In Detoxifying Lead Safely F...,1,1.0
6627,Gardaí Strike Negotiations Get Off To Bad Star...,1,1.0
6626,Gardasil is a Decision We Will Always Regret,1,1.0
19763,🚨Bill Clinton and Hillary Lolita Express Pedop...,1,1.0


In [50]:
title_stats[['total','fakeness']].value_counts()

total  fakeness
1      0.0         10377
       1.0          9078
2      1.0           261
3      1.0            31
4      1.0            11
2      0.0             3
5      1.0             2
4      0.0             1
Name: count, dtype: int64

* Majority of the titles are unique. The few that are duplicates are of records belonging to the same class, but the numbers aren't significant to allow the model to cheat. Hence, we will retain the titles and concatenate them to the text.

In [53]:
prepped_data.drop(columns='author', inplace=True)

In [57]:
prepped_data['full_text'] = prepped_data['title'].fillna("") + " " + prepped_data['text']

In [58]:
prepped_data.head()

Unnamed: 0,title,text,label,full_text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


## Text Length Statistics

In [59]:
prepped_data['char_count'] = prepped_data['full_text'].astype(str).map(len)

In [60]:
prepped_data['char_count'].describe()

count     20652.000000
mean       4628.469543
std        5129.856542
min           2.000000
25%        1712.750000
50%        3444.500000
75%        6359.000000
max      143035.000000
Name: char_count, dtype: float64

In [62]:
print("Shortest Text:")
print(prepped_data.loc[prepped_data['char_count'].idxmin(), ['text', 'char_count','label']])

Shortest Text:
text          f
char_count    2
label         1
Name: 786, dtype: object


In [63]:
print("Longest Text:")
print(prepped_data.loc[prepped_data['char_count'].idxmax(), ['text', 'char_count','label']])

Longest Text:
text          Заседание Международного дискуссионного клуба ...
char_count                                               143035
label                                                         1
Name: 19764, dtype: object


# Preprocess Data

We will create new columns:
1. 'text_clean': lowercased, urls removed, contractions fixed, punctuations removed
2. 'tokens': list of lemmatized tokens after removing stopwords and non-english words
3. 'text_lemmatized': tokens joined back after lemmatization for TF-IDF
4. 'token_count'


We will makes these for both text and full_text.

In [64]:
def fix_contractions_safe(s):
  try:
    return contractions.fix(s)
  except:
    return s
def clean_text(s):
    s = str(s)
    print(s[:10])
    # normalizing quotations
    quote_maps = {'“':'"', '”':'"', "‘":"'", "’":"'"}
    for curly_quote,quote in quote_maps.items():
        s = s.replace(curly_quote, quote)
    # remove non-ASCII
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # fixing contractions
    s = fix_contractions_safe(s)
    # removing urls
    s = re.sub(r'(https?:\/\/)?([\w\.-]+)\.([A-Za-z\.]{2,6})([\/\w \.-]*)*\/?', ' ', s)
    # removing punctuations (all non-alphanumeric characters and non-whitespaces)
    s = re.sub(r'[^a-zA-Z0-9 ]', ' ', s)
    # removing unnecessary whitespaces, newlines, tabspaces, etc.
    s = re.sub(r'\s+', ' ', s).strip()
    # returning lowercase text
    return s.lower()

In [None]:
prepped_data['text_clean'] = prepped_data['text'].progress_map(clean_text)

In [None]:
prepped_data['full_text_clean'] = prepped_data['full_text'].progress_map(clean_text)

In [67]:
display(prepped_data[['text','text_clean','full_text', 'full_text_clean']].head(10))

Unnamed: 0,text,text_clean,full_text,full_text_clean
0,House Dem Aide: We Didn’t Even See Comey’s Let...,house dem aide we did not even see comey s let...,House Dem Aide: We Didn’t Even See Comey’s Let...,house dem aide we did not even see comey s let...
1,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",flynn hillary clinton big woman on campus brei...
2,"Why the Truth Might Get You Fired October 29, ...",why the truth might get you fired october 29 2...,Why the Truth Might Get You Fired Why the Trut...,why the truth might get you fired why the trut...
3,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...,15 Civilians Killed In Single US Airstrike Hav...,15 civilians killed in single us airstrike hav...
4,Print \nAn Iranian woman has been sentenced to...,print an iranian woman has been sentenced to s...,Iranian woman jailed for fictional unpublished...,iranian woman jailed for fictional unpublished...
5,"In these trying times, Jackie Mason is the Voi...",in these trying times jackie mason is the voic...,Jackie Mason: Hollywood Would Love Trump if He...,jackie mason hollywood would love trump if he ...
6,Ever wonder how Britain’s most iconic pop pian...,ever wonder how britain s most iconic pop pian...,Life: Life Of Luxury: Elton John’s 6 Favorite ...,life life of luxury elton john s 6 favorite sh...
7,"PARIS — France chose an idealistic, traditi...",paris france chose an idealistic traditional c...,Benoît Hamon Wins French Socialist Party’s Pre...,beno t hamon wins french socialist party s pre...
8,Donald J. Trump is scheduled to make a highly ...,donald j trump is scheduled to make a highly a...,Excerpts From a Draft Script for Donald Trump’...,excerpts from a draft script for donald trump ...
9,A week before Michael T. Flynn resigned as nat...,a week before michael t flynn resigned as nati...,"A Back-Channel Plan for Ukraine and Russia, Co...",a back channel plan for ukraine and russia cou...


In [68]:
def tokenize_and_lemmatize(s):
    print(s[:10])
    tokens = word_tokenize(s)
    tokens = [token for token in tokens 
              if token in ENGLISH_WORDS 
              and token not in STOP_WORDS
              and len(token)>2]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [None]:
prepped_data['tokens'] = prepped_data['text_clean'].progress_map(tokenize_and_lemmatize)

In [None]:
prepped_data['full_tokens'] = prepped_data['full_text_clean'].progress_map(tokenize_and_lemmatize)

In [71]:
prepped_data[['text', 'tokens', 'full_text', 'full_tokens']].head(10)

Unnamed: 0,text,tokens,full_text,full_tokens
0,House Dem Aide: We Didn’t Even See Comey’s Let...,"[house, aide, even, see, letter, jason, darrel...",House Dem Aide: We Didn’t Even See Comey’s Let...,"[house, aide, even, see, letter, jason, house,..."
1,Ever get the feeling your life circles the rou...,"[ever, get, feeling, life, roundabout, rather,...","FLYNN: Hillary Clinton, Big Woman on Campus - ...","[hillary, clinton, big, woman, campus, ever, g..."
2,"Why the Truth Might Get You Fired October 29, ...","[truth, might, get, fired, october, tension, i...",Why the Truth Might Get You Fired Why the Trut...,"[truth, might, get, fired, truth, might, get, ..."
3,Videos 15 Civilians Killed In Single US Airstr...,"[single, rate, american, higher, engaged, acti...",15 Civilians Killed In Single US Airstrike Hav...,"[single, single, rate, american, higher, engag..."
4,Print \nAn Iranian woman has been sentenced to...,"[print, iranian, woman, six, prison, iran, rev...",Iranian woman jailed for fictional unpublished...,"[iranian, woman, fictional, unpublished, story..."
5,"In these trying times, Jackie Mason is the Voi...","[trying, time, mason, voice, reason, week, exc...",Jackie Mason: Hollywood Would Love Trump if He...,"[mason, hollywood, would, love, trump, bombed,..."
6,Ever wonder how Britain’s most iconic pop pian...,"[ever, wonder, britain, iconic, pop, pianist, ...",Life: Life Of Luxury: Elton John’s 6 Favorite ...,"[life, life, luxury, john, favorite, shark, st..."
7,"PARIS — France chose an idealistic, traditi...","[paris, chose, idealistic, traditional, candid...",Benoît Hamon Wins French Socialist Party’s Pre...,"[beno, french, socialist, party, presidential,..."
8,Donald J. Trump is scheduled to make a highly ...,"[donald, trump, make, highly, visit, church, s...",Excerpts From a Draft Script for Donald Trump’...,"[draft, script, donald, trump, black, church, ..."
9,A week before Michael T. Flynn resigned as nat...,"[week, michael, resigned, national, security, ...","A Back-Channel Plan for Ukraine and Russia, Co...","[back, channel, plan, russia, courtesy, trump,..."


In [74]:
prepped_data['text_lemmatized'] = prepped_data['tokens'].map(lambda tokens: " ".join(tokens))

In [75]:
prepped_data['full_text_lemmatized'] = prepped_data['full_tokens'].map(lambda tokens: " ".join(tokens))

In [72]:
prepped_data['token_count'] = prepped_data['tokens'].map(len)

In [73]:
prepped_data['full_token_count'] = prepped_data['full_tokens'].map(len)

In [76]:
prepped_data.head()

Unnamed: 0,title,text,label,full_text,char_count,text_clean,full_text_clean,tokens,full_tokens,token_count,full_token_count,text_lemmatized,full_text_lemmatized
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,5012,house dem aide we did not even see comey s let...,house dem aide we did not even see comey s let...,"[house, aide, even, see, letter, jason, darrel...","[house, aide, even, see, letter, jason, house,...",320,326,house aide even see letter jason darrell octob...,house aide even see letter jason house aide ev...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",4216,ever get the feeling your life circles the rou...,flynn hillary clinton big woman on campus brei...,"[ever, get, feeling, life, roundabout, rather,...","[hillary, clinton, big, woman, campus, ever, g...",287,292,ever get feeling life roundabout rather straig...,hillary clinton big woman campus ever get feel...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...,7726,why the truth might get you fired october 29 2...,why the truth might get you fired why the trut...,"[truth, might, get, fired, october, tension, i...","[truth, might, get, fired, truth, might, get, ...",476,480,truth might get fired october tension intellig...,truth might get fired truth might get fired oc...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...,3301,videos 15 civilians killed in single us airstr...,15 civilians killed in single us airstrike hav...,"[single, rate, american, higher, engaged, acti...","[single, single, rate, american, higher, engag...",192,193,single rate american higher engaged active com...,single single rate american higher engaged act...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...,1032,print an iranian woman has been sentenced to s...,iranian woman jailed for fictional unpublished...,"[print, iranian, woman, six, prison, iran, rev...","[iranian, woman, fictional, unpublished, story...",62,71,print iranian woman six prison iran revolution...,iranian woman fictional unpublished story woma...


### Deleting Records with no valid tokens

In [77]:
prepped_data.describe()

Unnamed: 0,label,char_count,token_count,full_token_count
count,20652.0,20652.0,20652.0,20652.0
mean,0.497046,4628.469543,298.989057,304.664149
std,0.500003,5129.856542,335.125542,335.515437
min,0.0,2.0,0.0,0.0
25%,0.0,1712.75,106.0,111.0
50%,0.0,3444.5,220.0,226.0
75%,1.0,6359.0,415.0,421.0
max,1.0,143035.0,9279.0,9284.0


In [78]:
prepped_data.loc[prepped_data['full_token_count']==0]

Unnamed: 0,title,text,label,full_text,char_count,text_clean,full_text_clean,tokens,full_tokens,token_count,full_token_count,text_lemmatized,full_text_lemmatized
47,"СМИ Сербии приписали россиянам ""подготовку тер...",0 комментариев 0 поделились Фото: AP \nОднако ...,1,"СМИ Сербии приписали россиянам ""подготовку тер...",3651,0 0 ap n1 26 27 16 16 20 viber whatsapp 27 27,0 0 ap n1 26 27 16 16 20 viber whatsapp 27 27,[],[],0,0,,
371,"Путин рассказал, когда в Крыму решат проблему ...",0 комментариев 7 поделились \nОтвечая на соотв...,1,"Путин рассказал, когда в Крыму решат проблему ...",2013,0 7 23 2015 2020 40 4 5 2 5 25,0 7 23 2015 2020 40 4 5 2 5 25,[],[],0,0,,
492,Казахстан на страже ядерной безопасности | Нов...,В ноябре 2016 г. Мажилис Парламента Республики...,1,Казахстан на страже ядерной безопасности | Нов...,6525,2016 2006 2011 2015 2015 2016 3 2016 2016 2018...,2016 2006 2011 2015 2015 2016 3 2016 2016 2018...,[],[],0,0,,
580,,Ludicrous...,1,Ludicrous...,13,,,[],[],0,0,,
650,Очередная автоколонна МЧС с гуманитарной помощ...,19 МЧС направило 57-ю по счёту автоколонну с ...,1,Очередная автоколонна МЧС с гуманитарной помощ...,757,19 57 40 440 2014 56 64,19 57 40 440 2014 56 64,[],[],0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18556,Минобороны России: Эвакуация жителей Алеппо бы...,"\nКонашенков пояснил, что представителями си...",1,Минобороны России: Эвакуация жителей Алеппо бы...,2188,40 afp 2011 2011 220 2014 17,40 afp 2011 2011 220 2014 17,[],[],0,0,,
19009,Опрос: Россияне одобряют действия президента П...,0 комментариев 4 поделились Фото: AP \nКак пок...,1,Опрос: Россияне одобряют действия президента П...,2768,0 4 ap 78 7 81 8 53 6 58 3 39 7 45 5 42 1 46 4...,0 4 ap 78 7 81 8 53 6 58 3 39 7 45 5 42 1 46 4...,[],[],0,0,,
19734,Москва ввела в эксплуатацию систему по борьбе ...,"\nКак пишет Коммерсант, система под название...",1,Москва ввела в эксплуатацию систему по борьбе ...,2801,5 50 934 60 26 100 36 volkswagen 12 36 4 06 00...,5 50 934 60 26 100 36 volkswagen 12 36 4 06 00...,[],[],0,0,,
20168,Стала известна возможная причина взрыва дома в...,Фото: © Пресс-служба МЧС по Рязанской области ...,1,Стала известна возможная причина взрыва дома в...,2309,23 10 1 279 61 162 31 pravda ru,23 10 1 279 61 162 31 pravda ru,[],[],0,0,,


In [79]:
prepped_data = prepped_data.drop(index=prepped_data.loc[prepped_data['full_token_count']==0].index)
prepped_data.shape

(20546, 13)

In [80]:
prepped_data.describe()

Unnamed: 0,label,char_count,token_count,full_token_count
count,20546.0,20546.0,20546.0,20546.0
mean,0.494451,4639.159252,300.531588,306.235958
std,0.499981,5138.287902,335.298338,335.663579
min,0.0,5.0,0.0,1.0
25%,0.0,1716.0,107.0,112.0
50%,0.0,3467.0,222.0,227.0
75%,1.0,6370.75,416.0,422.0
max,1.0,143035.0,9279.0,9284.0


In [81]:
prepped_data.to_csv('data/preprocessed_data.csv', index=False)

In [2]:
prepped_data = pd.read_csv('data/preprocessed_data.csv')

In [3]:
prepped_data[['full_text', 'full_tokens', 'full_text_lemmatized', 'label']].to_csv('data/fulltext_preprocessed_data.csv', index=False)

In [11]:
prepped_data2 = prepped_data[['text', 'tokens', 'text_lemmatized', 'label']]
prepped_data2 = prepped_data2.drop(index=prepped_data[prepped_data['token_count']==0].index)
prepped_data2.shape

(20459, 4)

In [12]:
prepped_data.to_csv('data/text_preprocessed_data.csv', index=False)