## Transformación y Carga de Datos


#### Importación de librerías

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import warnings
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

warnings.filterwarnings('ignore')

#### Lectura de datos con Pandas

In [2]:
# Load the two comma-separated source files: True.csv and Fake.csv.
df_true = pd.read_csv(r'../dataset/True.csv', sep=',')
df_fake = pd.read_csv(r'../dataset/Fake.csv', sep=',')

#### Procesamiento de datos

In [3]:
# Preview the first rows of df_true
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
# Check the DataFrame dimensions (number of rows and columns)
df_true.shape

(21417, 4)

In [5]:
# Preview the first rows of df_fake
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
# Check the DataFrame dimensions (number of rows and columns)
df_fake.shape

(23481, 4)

In [7]:
# The `label` column records whether the news article is real (1) or fake (0)
df_fake["label"] = 0
df_true["label"] = 1

##### Concatenación

In [8]:
# Stack df_fake on top of df_true and renumber the rows 0..n-1 in one step
df = pd.concat([df_fake, df_true], ignore_index=True)

In [9]:
# Build the `content` column by joining subject, title and text, separated by single spaces
df['content'] = df['subject'] + ' ' + df['title'] +  ' ' +  df['text']

##### Eliminación de columnas

In [10]:
# The following columns are removed because they are irrelevant for the model.
# The result is rebound to `df` instead of using `inplace=True`, which pandas
# discourages: it brings no performance benefit and prevents method chaining.
df = df.drop(columns=['date', 'subject', 'title', 'text'])

In [11]:
# Data type and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    44898 non-null  int64 
 1   content  44898 non-null  object
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


In [12]:
# Preview the first rows of the combined DataFrame
df.head()

Unnamed: 0,label,content
0,0,News Donald Trump Sends Out Embarrassing New ...
1,0,News Drunk Bragging Trump Staffer Started Rus...
2,0,News Sheriff David Clarke Becomes An Internet...
3,0,News Trump Is So Obsessed He Even Has Obama’s...
4,0,News Pope Francis Just Called Out Donald Trum...


##### Verificando valores nulos

In [13]:
# Count the null values in each column
df.isnull().sum()

label      0
content    0
dtype: int64

##### Verificando Duplicados

In [14]:
# Report how many rows are exact duplicates of an earlier row
print(f"Numero de filas duplicadas = {df.duplicated().sum()}")

Numero de filas duplicadas = 213


In [15]:
# Drop duplicated rows and renumber the remaining ones 0..n-1 in the same call
df = df.drop_duplicates(ignore_index=True)

##### Normalización de Datos

In [16]:
# Text normalization helper applied to every text column before stemming.
def getNormalize(text):
    """Clean a raw text value so only lowercase letters and spaces remain.

    Steps, in order:
      1. trim surrounding whitespace and drop double quotes;
      2. remove square-bracketed fragments such as "[video]";
      3. remove HTML-like tags, then URLs;
      4. replace every remaining non-word character with a space;
      5. drop underscores (the only punctuation step 4 leaves behind);
      6. drop every token that contains a digit;
      7. lowercase the result.

    Tags and URLs must be removed *before* step 4: once "://", "." and "<>"
    have been turned into spaces their patterns can no longer match, and the
    pieces of the URL/tag would leak into the output as ordinary words.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string. Runs of spaces are not collapsed.
    """
    text = text.strip()
    text = text.replace('"', '')
    text = re.sub(r'\[.*?\]', '', text)
    # Tags go first so a URL inside an attribute is removed together with its
    # tag; they become a space so that "a<br>b" does not collapse into "ab".
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Newlines and all other punctuation are turned into spaces here, so no
    # separate newline-removal step is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    # "_" counts as a word character, so it is the only punctuation left.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text.lower()

In [17]:
# Run getNormalize over every value of each object-dtype column
text_columns = df.select_dtypes(include='object').columns
for column_name in text_columns:
    df[column_name] = df[column_name].apply(getNormalize)

In [18]:
# Coerce the labels to numbers (unparseable values become NaN), then store them as a categorical
numeric_labels = pd.to_numeric(df['label'], errors="coerce")
df['label'] = numeric_labels.astype('category')

In [19]:
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

# English stop words kept in a set for fast membership checks
stopword_set = set(stopwords.words('english'))

ps = PorterStemmer()


def stemming(content):
    """Tokenize `content`, drop stop words and stem what is left.

    Tokens are the maximal runs of ASCII letters found in `content`. Tokens
    present in `stopword_set` are skipped; every other token is passed through
    the Porter stemmer `ps`.

    Args:
        content: the text to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    kept_stems = []
    for token in re.findall('[a-zA-Z]+', content):
        if token in stopword_set:
            continue
        kept_stems.append(ps.stem(token))
    return ' '.join(kept_stems)

In [20]:
# Replace each row's content with the output of stemming()
df['content'] = df['content'].apply(stemming)

In [21]:
# Display the processed DataFrame
df

Unnamed: 0,label,content
0,0,news donald trump send embarrass new year eve ...
1,0,news drunk brag trump staffer start russian co...
2,0,news sheriff david clark becom internet joke t...
3,0,news trump obsess even obama name code websit ...
4,0,news pope franci call donald trump christma sp...
...,...,...
44680,1,worldnew fulli commit nato back new u approach...
44681,1,worldnew lexisnexi withdrew two product chines...
44682,1,worldnew minsk cultur hub becom author minsk r...
44683,1,worldnew vatican upbeat possibl pope franci vi...


In [22]:
# Persist the processed DataFrame as a binary pickle file. The `with` block
# guarantees the file handle is closed even if pickle.dump raises.
with open("../dataset/data.pkl", "wb") as pickle_out:
    pickle.dump(df, pickle_out)