## Transformación y Carga de Datos


#### Importación de librerías

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import warnings
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

#### Lectura de datos con Pandas

In [2]:
# Load the real-news and fake-news CSV files, in that order, into two DataFrames.
df_true, df_fake = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (r'../dataset/True.csv', r'../dataset/Fake.csv')
)

#### Procesamiento de datos

In [3]:
# Preview the first 20 rows of the real-news frame
df_true.head(20)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [4]:
# Show the first 10 article bodies of the real-news frame
df_true["text"].head(10)

0    WASHINGTON (Reuters) - The head of a conservat...
1    WASHINGTON (Reuters) - Transgender people will...
2    WASHINGTON (Reuters) - The special counsel inv...
3    WASHINGTON (Reuters) - Trump campaign adviser ...
4    SEATTLE/WASHINGTON (Reuters) - President Donal...
5    WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...
6    WEST PALM BEACH, Fla (Reuters) - President Don...
7    The following statements were posted to the ve...
8    The following statements were posted to the ve...
9    WASHINGTON (Reuters) - Alabama Secretary of St...
Name: text, dtype: object

In [5]:
# Remove the literal "(Reuters)" source tag from every article body.
# The parentheses are escaped because, with regex=True, a bare "(Reuters)" is a
# capture group that matches only the word "Reuters" and leaves "()" in the text.
df_true["text"] = df_true["text"].replace(r"\(Reuters\)", "", regex=True)

In [6]:
# Check the DataFrame's dimensions (number of rows and columns)
df_true.shape

(21417, 4)

In [7]:
# Preview the first rows of the fake-news frame
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
# Check the DataFrame's dimensions (number of rows and columns)
df_fake.shape

(23481, 4)

In [9]:
# The "label" column marks whether an article is fake (0) or real (1).
for frame, class_id in ((df_fake, 0), (df_true, 1)):
    frame["label"] = class_id

##### Concatenación

In [10]:
# Stack the fake and real articles into one frame with a fresh 0..n-1 index.
df = pd.concat([df_fake, df_true], ignore_index=True)

In [11]:
# Rename the article-body column from "text" to "content".
df = df.rename(columns={'text': 'content'})

##### Eliminación de columnas

In [12]:
# These columns are dropped because they are considered irrelevant for the model.
df = df.drop(columns=['date', 'subject', 'title'])

In [13]:
# Column data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  44898 non-null  object
 1   label    44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


In [14]:
# Preview the first rows of the combined frame
df.head()

Unnamed: 0,content,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [15]:
# Shuffle all rows so fake and real articles are interleaved, then rebuild the index.
# A fixed random_state makes the row order (and everything derived from it)
# identical on every "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

##### Verificando valores nulos

In [16]:
# Count missing values per column
df.isna().sum()

content    0
label      0
dtype: int64

##### Verificando Duplicados

In [17]:
# Report how many rows are exact duplicates of an earlier row
print(f"Numero de filas duplicadas = {df.duplicated().sum()}")

Numero de filas duplicadas = 6251


In [18]:
# Drop duplicated records and renumber the remaining rows from 0.
df = df.drop_duplicates().reset_index(drop=True)

##### Normalización de Datos

In [19]:
def getNormalize(text):
    """Normalise a raw article string into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. Replace URLs (``http://``, ``https://``, ``www.``) and HTML-like tags with a space.
         This must happen while ``:``, ``/``, ``.``, ``<`` and ``>`` are still present;
         once punctuation is blanked out these patterns can no longer match.
      2. Delete double quotes, hyphens, underscores, parentheses and square brackets
         without inserting a space (so ``well-known`` becomes ``wellknown``).
      3. Replace every remaining non-word character with a space.
      4. Delete every token that contains a digit.
      5. Collapse whitespace runs to one space and delete characters outside
         ``[a-zA-Z0-9]`` and whitespace (e.g. accented letters).
      6. Trim leading/trailing whitespace and lowercase the result.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by ``re.sub``).
    """
    # URLs and tags first: they are only recognisable while punctuation is intact.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    # Characters removed outright, joining the surrounding fragments.
    text = re.sub(r'["\-_()\[\]]', '', text)
    # Any other punctuation or symbol becomes a word separator.
    text = re.sub(r'\W', ' ', text)
    # Drop tokens containing digits (years, amounts, alphanumeric ids).
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\s+', ' ', text)
    # \w is Unicode-aware, so non-ASCII letters survive until this step.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Trim last: the substitutions above can leave spaces at either end.
    return text.strip().lower()

In [20]:
# Run getNormalize over every value of each object-dtype column.
text_columns = df.select_dtypes(include='object').columns
df[text_columns] = df[text_columns].apply(lambda column: column.apply(getNormalize))

In [21]:
# Coerce labels to numeric (unparseable values become NaN) and store them as a categorical.
df['label'] = pd.to_numeric(df['label'], errors="coerce").astype('category')

In [22]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def stemming(content):
    """Tokenise `content`, drop stop words and stem what remains.

    Tokens are maximal runs of ASCII letters. A token is discarded when it is a
    member of the module-level `stopword_set` (looked up when the function is
    called); every other token is passed through the Porter stemmer `ps`.

    Returns the stems joined by single spaces.
    """
    kept_stems = []
    for token in re.findall('[a-zA-Z]+', content):
        if token in stopword_set:
            continue
        kept_stems.append(ps.stem(token))
    return ' '.join(kept_stems)


# English stop words held in a set for fast membership tests
stopword_set = set(stopwords.words('english'))

In [23]:
# Replace each article body with its stop-word-free, stemmed form.
df['content'] = df['content'].map(stemming)

In [None]:
# Serialise the processed DataFrame to a binary pickle file.
# The context manager closes the file even if pickle.dump raises.
with open("../dataset/data.pkl", "wb") as pickle_out:
    pickle.dump(df, pickle_out)