# Preprocess

In [2]:
import pandas as pd

In [16]:
def find_long_articles(df, col, min_len, max_len):
    """
    Returns the DataFrame with content length more than
    specific charaters.
    """
    df['length'] = df[col].str.len()
    return df[(df.length > min_len) & (df.length < max_len)]

In [4]:
def preprocess(Data, col):
    """
    Strip or replace unnecessary details.
    """
    # Remove ampersand
    Data[col] = Data[col].str.replace(r'[%$#]*', '')
    # Replace & by 'and'
    Data[col] = Data[col].str.replace(r'&', 'and')
    # Remove 'Headline:' and content before it
    Data[col] = Data[col].str.replace(r'[\s\S]*Headline:[\s\w!]*\s\s', '')
    # Remove (), [] and content inside
    Data[col] = Data[col].str.replace(r'([\(\[])[\s\S]*([\)\]])\S*', '')
    # Remove underscores
    Data[col] = Data[col].str.replace(r'_+', ' ')
    # Replace any date by 'date'
    Data[col] = Data[col].str.replace(r'\d+\/\d+\/\d+', 'date')
    # Replace any digit by 'number'
    Data[col] = Data[col].str.replace(r'\d*\.*,*:*\d+', 'number')
    # Remove more than two spaces together
    Data[col] = Data[col].str.replace(r'\s{2,}', ' ')
    # Remove Source url
    Data[col] = Data[col].str.replace(r'\s[Ss]\w*:\s\w*:.*', '')
    # Remove url in general
    Data[col] = Data[col].str.replace(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '')
    return Data

In [5]:
def save_csv(df, path):
    df.to_csv(path + 'processed.csv', index=False)

## Define Path and news types

In [37]:
path = '../Datasets/LittleSplits/'
newstypes = ['fake',
             'reliable',
             'rumor',
             'clickbait',
             'hate',
             'satire',
             'unreliable'
            ]

## Iterate

In [38]:
df  = pd.DataFrame()

for news_set in newstypes:
    newsDF = pd.read_csv(path + news_set + '_little.csv', encoding='utf-8')
    newsDF.drop(columns=['authors'], inplace=True)
    # Preprocess the content and title
    newsDF = preprocess(newsDF, 'content')
    newsDF = preprocess(newsDF, 'title')
    # Keep articles with length more than specified minimum
    newsDF = find_long_articles(newsDF, 1000, 5000)
    objs = [df, newsDF]
    # Join the datasets
    df = pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=True, 
                   keys=None, levels=None, names=None, verify_integrity=False, 
                   copy=True)
    
save_csv(df, path)

## Check

In [35]:
df

Unnamed: 0,type,content,title,length
0,fake,"Quite frankly, I’m surprised it has half left....",Surprise: Socialist Hotbed Of Venezuela Has Lo...,1492
1,fake,"If You Are Upset About Being Called Fake News,...",Water Cooler date Open Thread; Fake News ? CNN...,2355
2,fake,Let’s be honest: This is pretty much all of Fo...,Veteran Commentator Calls Out the Growing “Eth...,2171
3,fake,"Let me tell you something, about otters and mo...","Lost Words, Hidden Words, Otters, Banks and Books",3131
4,fake,Never mind transportation and power generation...,Why Sandwiches Must Be Banned,1650
5,fake,Israelis and Palestinians have always been on ...,Poll: Calls for War From Israelis and Palestin...,1868
6,fake,Every college basketball fan knows all too wel...,"College Basketball Rankings, Week number: Top ...",1017
7,fake,VIEW GALLERY The Boston Celtics are traveling ...,Celtics vs. Lakers Live Stream: Watch Online,3397
8,fake,Gallup released its annual list of the men and...,number Republican Women Just Beat Beyonce For ...,1726
9,fake,Donald Trump opened himself to the White House...,CNN Kept Shouting Questions At Trump — He Sile...,1106


In [13]:
df  = pd.DataFrame()


newsDF = pd.read_csv('../src/datasets/kaggle_processed.csv', encoding='utf-8')
newsDF.head()

Unnamed: 0,Headline,articleBody,target,type
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,unreliable
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,reliable
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,unreliable
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,unreliable
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,unreliable


In [14]:
newdf = newsDF[:500]
newdf.shape

(500, 4)

In [15]:
# newsDF.drop(columns=['authors'], inplace=True)
# Preprocess the content and title
newdf = preprocess(newdf, 'articleBody')
newdf = preprocess(newdf, 'Headline')
# Keep articles with length more than specified minimum
newdf = find_long_articles(newdf, 'articleBody', 1000, 5000)
df.save_csv('kaggle_clean.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/sta

AttributeError: 'DataFrame' object has no attribute 'content'

In [17]:
newdf

Unnamed: 0,Headline,articleBody,target,type
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,unreliable
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,reliable
2,Why the Truth Might Get You Fired,Why the Truth Might Get You Fired October numb...,1,unreliable
3,number Civilians Killed In Single US Airstrike...,Videos number Civilians Killed In Single US Ai...,1,unreliable
4,Iranian woman jailed for fictional unpublished...,Print An Iranian woman has been sentenced to s...,1,unreliable
5,Jackie Mason: Hollywood Would Love Trump if He...,"In these trying times, Jackie Mason is the Voi...",0,reliable
6,Life: Life Of Luxury: Elton John’s number Favo...,Ever wonder how Britain’s most iconic pop pian...,1,unreliable
7,Benoît Hamon Wins French Socialist Party’s Pre...,"PARIS — France chose an idealistic, traditiona...",0,reliable
8,Excerpts From a Draft Script for Donald Trump’...,Donald J. Trump is scheduled to make a highly ...,0,reliable
9,"A Back-Channel Plan for Ukraine and Russia, Co...",A week before Michael T. Flynn resigned as nat...,0,reliable


In [20]:
newdf = find_long_articles(newdf, 'articleBody', 1000, 5000)
newdf.to_csv('kaggle_clean.csv', index=False)

In [27]:
newdf.articleBody[1]

'Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? took issue with the previous speaker. Despite becoming the first to win election to a seat in the U. S. Senate since Reconstruction, Edward Brooke came in for criticism for calling for “empathy” for the goals of protestors as he criticized tactics. Though Clinton in her senior thesis on Saul Alinsky lamented “Black Power demagogues” and “elitist arrogance and repressive intolerance” within the New Left, similar words coming out of a Republican necessitated a brief rebuttal. “Trust,” Rodham ironically observed in number, “this is one word that when I asked the class at our rehearsal what it was they wanted me to say for them, everyone came up to me and said ‘Talk about trust, talk about the lack of trust both for us and the way we feel about others. Talk about the trust bust.’ What can you say about it? What can you say about a feeling that permeates a generation 

In [36]:
newdf.drop(columns=['index'], inplace=True)

In [31]:
newdf.reset_index(inplace=True, drop=True)

In [1]:
newdf.head()

NameError: name 'newdf' is not defined

In [39]:
newdf.to_csv('../src/datasets/kaggle_clean.csv', index=False)