# Preprocess

In [5]:
import pandas as pd

In [15]:
def find_long_articles(df, min_len, max_len):
    """
    Returns the DataFrame with content length more than
    specific charaters.
    """
    df['length'] = df.content.str.len()
    return df[(df.length > min_len) & (df.length < max_len)]

In [20]:
def preprocess(Data, col):
    """
    Strip or replace unnecessary details.
    """
    # Remove ampersand
    Data[col] = Data[col].str.replace(r'[%$#]*', '')
    # Replace & by 'and'
    Data[col] = Data[col].str.replace(r'&', 'and')
    # Remove 'Headline:' and content before it
    Data[col] = Data[col].str.replace(r'[\s\S]*Headline:[\s\w!]*\s\s', '')
    # Remove (), [] and content inside
    Data[col] = Data[col].str.replace(r'([\(\[])[\s\S]*([\)\]])\S*', '')
    # Remove underscores
    Data[col] = Data[col].str.replace(r'_+', ' ')
    # Replace any date by 'date'
    Data[col] = Data[col].str.replace(r'\d+\/\d+\/\d+', 'date')
    # Replace any digit by 'number'
    Data[col] = Data[col].str.replace(r'\d*\.*,*:*\d+', 'number')
    # Remove more than two spaces together
    Data[col] = Data[col].str.replace(r'\s{2,}', ' ')
    # Remove Source url
    Data[col] = Data[col].str.replace(r'\s[Ss]\w*:\s\w*:.*', '')
    # Remove url in general
    Data[col] = Data[col].str.replace(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '')
    return Data

In [36]:
def save_csv(df, path):
    df.to_csv(path + 'processed.csv', index=False)

## Define Path and news types

In [37]:
path = '../Datasets/LittleSplits/'
newstypes = ['fake',
             'reliable',
             'rumor',
             'clickbait',
             'hate',
             'satire',
             'unreliable'
            ]

## Iterate

In [38]:
df  = pd.DataFrame()

for news_set in newstypes:
    newsDF = pd.read_csv(path + news_set + '_little.csv', encoding='utf-8')
    newsDF.drop(columns=['authors'], inplace=True)
    # Preprocess the content and title
    newsDF = preprocess(newsDF, 'content')
    newsDF = preprocess(newsDF, 'title')
    # Keep articles with length more than specified minimum
    newsDF = find_long_articles(newsDF, 1000, 5000)
    objs = [df, newsDF]
    # Join the datasets
    df = pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=True, 
                   keys=None, levels=None, names=None, verify_integrity=False, 
                   copy=True)
    
save_csv(df, path)

## Check

In [35]:
df

Unnamed: 0,type,content,title,length
0,fake,"Quite frankly, I’m surprised it has half left....",Surprise: Socialist Hotbed Of Venezuela Has Lo...,1492
1,fake,"If You Are Upset About Being Called Fake News,...",Water Cooler date Open Thread; Fake News ? CNN...,2355
2,fake,Let’s be honest: This is pretty much all of Fo...,Veteran Commentator Calls Out the Growing “Eth...,2171
3,fake,"Let me tell you something, about otters and mo...","Lost Words, Hidden Words, Otters, Banks and Books",3131
4,fake,Never mind transportation and power generation...,Why Sandwiches Must Be Banned,1650
5,fake,Israelis and Palestinians have always been on ...,Poll: Calls for War From Israelis and Palestin...,1868
6,fake,Every college basketball fan knows all too wel...,"College Basketball Rankings, Week number: Top ...",1017
7,fake,VIEW GALLERY The Boston Celtics are traveling ...,Celtics vs. Lakers Live Stream: Watch Online,3397
8,fake,Gallup released its annual list of the men and...,number Republican Women Just Beat Beyonce For ...,1726
9,fake,Donald Trump opened himself to the White House...,CNN Kept Shouting Questions At Trump — He Sile...,1106
