In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Load the training and validation files. They are read with header=None, so no
# row is consumed as a header and the column names are assigned explicitly.
data = pd.read_csv('twitter_training.csv', header=None)
validation = pd.read_csv('twitter_validation.csv', header=None)

columns = ['id', 'Company', 'Label', 'Text']
data.columns = columns
validation.columns = columns

# Drop rows without tweet text from BOTH frames so that later per-row string
# processing of the 'Text' column never receives NaN.
data = data.dropna(subset=['Text'])
validation = validation.dropna(subset=['Text'])


In [5]:
# Preview the first rows of the training frame after loading and dropping rows without text.
data.head()

Unnamed: 0,id,Company,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
def clean_text(text):
    """Normalize raw text: strip non-alphanumeric characters and lowercase.

    Args:
        text: The raw text. Non-string input is tolerated: ``None`` and float
            NaN (how pandas represents a missing cell) yield an empty string,
            and any other non-string value is converted with ``str()`` first.

    Returns:
        str: ``text`` with every character other than ASCII letters, digits
        and whitespace removed, converted to lowercase.
    """
    if not isinstance(text, str):
        # NaN is the only float value that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Characters are deleted, not replaced by a space, so "this-game" becomes "thisgame".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

# Run the character-level cleanup over the tweet text of both datasets.
for frame in (data, validation):
    frame['Text'] = frame['Text'].apply(clean_text)

# English stop words held in a set, so the membership test in preprocess_text is a hash lookup.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally - confirm it is
# downloaded somewhere outside this view.
stop_words = set(stopwords.words('english'))
# One lemmatizer instance, created once and reused for every token in preprocess_text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, drop stop words and lemmatize what remains.

    Args:
        text: The string to process; it is passed to ``word_tokenize`` as is.

    Returns:
        str: The lemmatized tokens that are not in ``stop_words``, joined by
        single spaces in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Tokenize, remove stop words and lemmatize the tweet text of both datasets.
for frame in (data, validation):
    frame['Text'] = frame['Text'].apply(preprocess_text)

from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then reuse the
# same fitted encoder for the validation labels so the codes agree across frames.
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(data['Label'])
data['Label2'] = encoded_train_labels
encoded_validation_labels = label_encoder.transform(validation['Label'])
validation['Label2'] = encoded_validation_labels

# Hold out 20% of the tweet groups as the test split (seeded for reproducibility).
#
# NOTE(review): the preview of `data` shows several near-identical rows sharing
# one `id` (variants of the same tweet). A row-level random split scatters those
# siblings across train and test, so the test score largely measures recall of
# near-duplicates. Splitting by `id` keeps every group on one side only.
# This assumes `id` identifies such a group throughout the file - confirm.
# test_size here is a fraction of groups, so the row fraction is only approximately 20%.
from sklearn.model_selection import GroupShuffleSplit

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    group_splitter.split(data['Text'], data['Label2'], groups=data['id'])
)

# split() yields positional indices, hence .iloc.
X_train, X_test = data['Text'].iloc[train_idx], data['Text'].iloc[test_idx]
y_train, y_test = data['Label2'].iloc[train_idx], data['Label2'].iloc[test_idx]

In [7]:
# Preview the training frame again: 'Text' now holds the cleaned, lemmatized text
# and 'Label2' the integer-encoded label.
data.head()

Unnamed: 0,id,Company,Label,Text,Label2
0,2401,Borderlands,Positive,im getting borderland murder,3
1,2401,Borderlands,Positive,coming border kill,3
2,2401,Borderlands,Positive,im getting borderland kill,3
3,2401,Borderlands,Positive,im coming borderland murder,3
4,2401,Borderlands,Positive,im getting borderland 2 murder,3


In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Vocabulary limits shared by both vectorizers: keep at most 10000 terms and
# ignore terms that appear in fewer than 5 training documents.
vectorizer_options = {'max_features': 10000, 'min_df': 5}

# Bag of Words: the vocabulary is fitted on the training split and reused for the test split.
count_vectorizer = CountVectorizer(**vectorizer_options)
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

# TF-IDF: same fit-on-train / transform-on-test pattern.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [9]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize for Word2Vec: split each document on whitespace.
X_train_tokens = [text.split() for text in X_train]
X_test_tokens = [text.split() for text in X_test]

# Train Word2Vec on the training split only (sg=0 selects CBOW).
# workers=1 makes training single-threaded: per the gensim documentation the
# default multi-threaded training introduces ordering jitter from OS thread
# scheduling, so embeddings (and everything derived from them) would differ
# between otherwise identical runs. gensim's `seed` keeps its default value.
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,
    workers=1,
)

# Average the Word2Vec vectors of a document's in-vocabulary tokens.
def text_to_word2vec(tokens, model, vector_size=100):
    """Embed a tokenized document as the mean of its known word vectors.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object exposing ``wv``, which supports ``token in model.wv`` and
            ``model.wv[token]`` (e.g. a trained Word2Vec model).
        vector_size: Length of the returned vector; must match the length of
            the vectors stored in ``model.wv``.

    Returns:
        numpy.ndarray: Vector of length ``vector_size``. It is all zeros when
        none of the tokens is present in ``model.wv``.
    """
    known_tokens = [token for token in tokens if token in model.wv]
    total = np.zeros(vector_size)
    for token in known_tokens:
        total += model.wv[token]
    if not known_tokens:
        # Nothing to average: return the zero vector as is.
        return total
    total /= len(known_tokens)
    return total

# Build one averaged Word2Vec vector per document, then stack the vectors into
# an array for each split.
train_doc_vectors = [text_to_word2vec(doc_tokens, word2vec_model) for doc_tokens in X_train_tokens]
X_train_w2v = np.array(train_doc_vectors)

test_doc_vectors = [text_to_word2vec(doc_tokens, word2vec_model) for doc_tokens in X_test_tokens]
X_test_w2v = np.array(test_doc_vectors)