# Convolutional Neural Networks using `word2vec`

In [53]:
import torch
from torchtext import data
from torchtext import datasets
import random
import nltk
import re 
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

In [7]:
# Seed every random number generator this notebook imports so that reruns
# are comparable; seeding torch alone leaves `random` and NumPy unseeded.
SEED = 1984
random.seed(SEED)                 # Python's built-in generator
np.random.seed(SEED)              # NumPy's global generator
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device; ignored when CUDA is unavailable
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic algorithms
torch.backends.cudnn.benchmark = False     # keep the cuDNN autotuner from varying the algorithm between runs

In [8]:
# Typographic single quotes are mapped to the ASCII apostrophe so that the
# contraction rules below also fire on text such as "we\u2019ll".
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'"})

# Ordered (pattern, replacement) rewrites, compiled once.
# Order matters: tags are stripped first because the final punctuation rule
# deletes "<" and ">", after which a tag could no longer be recognised.
_TEXT_REWRITES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"<.*?>", " "),
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
)


def clean_text(text):
    """Normalise a piece of text and split it into whitespace tokens.

    The text is lower-cased, typographic single quotes are converted to
    ASCII apostrophes, markup-like ``<...>`` spans are replaced by a space,
    a fixed set of English contractions is expanded, and a fixed set of
    punctuation characters is deleted. Apostrophes and ``!`` are not in
    that set, so they survive in the tokens.

    Args:
        text: The string to clean.

    Returns:
        A list of tokens obtained by splitting the cleaned text on
        whitespace; an empty list for empty or whitespace-only input.
    """
    text = text.lower().translate(_APOSTROPHE_TABLE)
    for pattern, replacement in _TEXT_REWRITES:
        text = pattern.sub(replacement, text)
    return text.split()

# Text field: tokens come from clean_text, lower-casing is requested from
# torchtext as well, and sequences are given a fixed length of 800.
TEXT = data.Field(
    tokenize=clean_text,
    lower=True,
    fix_length=800,
)

# Label field whose values are stored as torch.float.
LABEL = data.LabelField(dtype=torch.float)

## Load train, test, and validation data

In [30]:
# Read the three news-headline splits from CSV, in train/test/validation order.
train_df, test_df, validation_df = map(
    pd.read_csv,
    ('us_news_train.csv', 'us_news_test.csv', 'us_news_validation.csv'),
)

In [27]:
# Preview the first rows of the training frame to inspect the loaded columns.
train_df.head()

Unnamed: 0,outlet,headline,ideology
0,cnbc,Venezuela's electricity crisis could trigger '...,center
1,cnbc,Canada's Husky Energy offers to buy MEG Energy...,center
2,cnbc,3 innovative new travel apps that save time an...,center
3,cnbc,Germany’s economy is showing ‘signs of fear’ o...,center
4,thehill,The Memo: Is Michelle Obama the one critic Tru...,center


In [33]:
# Keep only the columns of interest; this drops the "Unnamed: 0" column
# visible in the train_df.head() preview.
# validation_df is trimmed too so that all three splits share one layout.
# NOTE(review): assumes the validation CSV has the same three columns — confirm.
columns_of_interest = ['outlet', 'headline', 'ideology']
train_df = train_df[columns_of_interest]
test_df = test_df[columns_of_interest]
validation_df = validation_df[columns_of_interest]

In [38]:
# Pull the headline column and the ideology column out of the train and
# test frames as array values, one (headlines, labels) pair per split.
headlines_train, ideo_train = (
    train_df[column].values for column in ('headline', 'ideology')
)
headlines_test, ideo_test = (
    test_df[column].values for column in ('headline', 'ideology')
)

## Baseline 

In [52]:
# Bag-of-words features for the baseline: the vocabulary is learned from the
# training headlines only, and both splits are encoded against it.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: lower not found", which suggests some document had no
# .lower() method — confirm that headlines_train / headlines_test hold plain
# strings (no missing values, not an already-vectorised matrix).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(headlines_train)
X_test = vectorizer.transform(headlines_test)

AttributeError: lower not found