In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


I'm going to use 3 ways for embeddings
- count vectorizer
- tfidf
- word2vec

In [None]:
import numpy as np
import pandas as pd

In [None]:
train_path = '/content/drive/My Drive/train_clean.csv'
test_path = '/content/drive/My Drive/test_clean.csv'
train_data = pd.read_csv(train_path,sep = ",", header=0)
test_data = pd.read_csv(test_path, sep = ",", header=0)

In [None]:
train = train_data["preprocessed_text"]
train_target = train_data["type"]

test = test_data["preprocessed_text"]
test_target = test_data["type"]

# Create a new DataFrame for the train data with preprocessed text and target
train_combined = pd.DataFrame({
    'preprocessed_text': train_data["preprocessed_text"],
    'target': train_data["type"]
})

# Create a new DataFrame for the test data with preprocessed text and target
test_combined = pd.DataFrame({
    'preprocessed_text': test_data["preprocessed_text"],
    'target': test_data["type"]
})

In [None]:
# deal with missing data and show the missing data rate

def remove_missing_data_and_calculate_rate(df):
    # Step 1: Calculate the missing data rate
    missing_data_rate = (df.isnull().sum() / len(df)) * 100

    # Step 2: Drop rows with missing data
    df_cleaned = df.dropna()

    return df_cleaned, missing_data_rate

train_cleaned, missing_data_rate = remove_missing_data_and_calculate_rate(train_combined)


print("Cleaned Train Data (Rows with missing data removed):")
print(train_cleaned.head())

print("\nMissing Data Rate for each column:")
print(missing_data_rate)

Cleaned Train Data (Rows with missing data removed):
              preprocessed_text    target
0  im getting borderland murder  Positive
1            coming border kill  Positive
2    im getting borderland kill  Positive
3   im coming borderland murder  Positive
4  im getting borderland murder  Positive

Missing Data Rate for each column:
preprocessed_text    2.269175
target               0.000000
dtype: float64


In [None]:
test_cleaned, missing_data_rate = remove_missing_data_and_calculate_rate(test_combined)
print("Cleaned Test Data (Rows with missing data removed):")
print(train_cleaned.head())

print("\nMissing Data Rate for each column:")
print(missing_data_rate)

Cleaned Test Data (Rows with missing data removed):
              preprocessed_text    target
0  im getting borderland murder  Positive
1            coming border kill  Positive
2    im getting borderland kill  Positive
3   im coming borderland murder  Positive
4  im getting borderland murder  Positive

Missing Data Rate for each column:
preprocessed_text    0.1
target               0.0
dtype: float64


In [None]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
len(train)

71656

In [None]:
train = train_cleaned["preprocessed_text"]
train_target = train_cleaned["target"]
test = test_cleaned["preprocessed_text"]
test_target = test_cleaned["target"]

1. count vecorize

In [None]:
def count_vectorizer(train, test = None):
    cv = CountVectorizer()
    train = cv.fit_transform(train)
    if test is not None:
        test = cv.transform(test)
        return train, test, cv
    else:
        return train, cv

In [None]:
train_cv , test_cv, cv = count_vectorizer(train,test)

In [None]:
cv.get_feature_names_out()

array(['aa', 'aaa', 'aaaaaaaaaaaa', ..., 'zysola', 'zzvfsrhewg', 'zzz'],
      dtype=object)

In [None]:
train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

2. tfidf

In [None]:
def tfidf_vectorizer(train, test = None):
    tfidf = TfidfVectorizer(stop_words = "english", ngram_range = (1,2), analyzer = "word",
                            max_df = 0.5, binary = False,  token_pattern=r'\w+', sublinear_tf=False
                           )

    # max_df This parameter sets the maximum document frequency threshold.
    # It means that if a word appears in more than 50% of the documents,
    # it will be ignored because it is considered too common to be useful
    # for distinguishing between documents.
    train = tfidf.fit_transform(train)
    if test is not None:
        test = tfidf.transform(test)
        return train, test, tfidf
    else:
        return train, tfidf

In [None]:
train_tfidf, test_tfidf, tfidf = tfidf_vectorizer(train,test)

3. word2vec

In [None]:
# Function to train Word2Vec and transform data into embeddings
def word2vec(train, test=None):
    # Step 1: Train Word2Vec model on training data
    word2vec_model = Word2Vec(sentences=[i.split() for i in train], vector_size=100, window=5, min_count=1, sg=0)

    # Step 2: Function to generate sentence vectors by averaging word vectors
    def get_sentence_vector(sentence, model, vector_size):
        sentence_vector = np.zeros(vector_size)  # Initialize an empty vector
        count = 0
        for word in sentence.split():
            if word in model.wv:
                sentence_vector += model.wv[word]
                count += 1
        if count != 0:
            sentence_vector /= count  # Average the vectors
        return sentence_vector

    # Step 3: Convert train dataset to sentence vectors
    train_vectors = np.array([get_sentence_vector(sentence, word2vec_model, 100) for sentence in train])

    # If test data is provided, apply the same transformation
    if test is not None:
        test_vectors = np.array([get_sentence_vector(sentence, word2vec_model, 100) for sentence in test])
        return train_vectors, test_vectors, word2vec_model
    else:
        return train_vectors, word2vec_model


In [None]:
train_word2vec, test_word2vec, word2vec = word2vec(train, test)

In [None]:
pubg_vector = word2vec.wv['pubg']
print(f"Vector for 'pubg':\n{pubg_vector}")

Vector for 'pubg':
[-0.01912192  0.8862802  -0.09390827 -2.6960688  -0.26106402 -0.09441577
 -0.53028303 -0.5510987  -1.0323496  -0.67864054 -0.20726399 -0.12606059
 -2.18844    -0.45130172  0.11052801  0.10869858 -0.09418953 -1.0169723
  1.2788637  -2.8027942   1.4971087   0.2049509   1.5463896  -0.076642
 -0.39039338  0.26753086  1.3301667  -1.0254695  -0.68401706 -0.01117503
  1.6573915  -0.09380846 -0.5970899  -2.5046206  -0.19524558 -0.45864773
 -1.1758468  -0.33340088  0.88980734  0.35717857 -0.8066866  -1.1878077
 -0.3714643  -1.1475065  -0.8210059  -0.07598607 -0.1593263  -1.8985881
  0.17395508 -0.29244334 -0.10499473  0.39469022 -0.8089991   0.86934066
  0.5785795  -0.5142811  -0.5436694  -0.42741182  0.23162568 -0.25191343
 -0.47150403 -0.5589146  -0.68859303  0.810955    2.277868    0.53812003
  1.32995    -1.0934389  -1.0493196   0.82483464 -0.34701926  0.31328374
 -0.15807578  0.7734189  -0.6061423  -1.5327737  -0.16377556 -0.29038188
 -0.01483013  0.15960999  0.34047565 

Finally, let's trasform the target into embeddings!

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# 2. Fit the label encoder and transform the target labels to numeric form
y_train_encoded = label_encoder.fit_transform(train_target)
print(y_train_encoded)

y_test_encoded = label_encoder.transform(test_target)
print(y_test_encoded)

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", label_mapping)

[3 3 3 ... 3 3 3]
[0 2 1 1 2 1 3 3 3 1 3 3 1 2 1 3 3 1 3 1 1 2 0 1 2 2 1 0 0 1 3 3 1 3 1 2 2
 0 3 2 3 2 2 2 3 2 1 1 1 2 3 1 1 3 3 3 3 3 1 0 1 3 3 0 1 2 1 0 2 1 3 1 1 3
 3 0 3 0 2 2 2 3 3 2 3 2 1 0 1 2 2 1 3 0 0 1 1 1 2 3 2 1 3 3 2 3 2 3 1 2 2
 2 1 2 1 2 2 3 3 2 1 1 3 1 2 1 3 2 1 2 0 3 2 3 3 0 2 2 0 0 0 2 2 0 0 0 3 2
 3 0 3 1 2 2 2 0 2 1 2 3 1 2 1 0 0 0 2 1 1 1 3 3 3 2 2 3 0 2 2 2 3 2 1 1 2
 3 3 0 0 2 3 3 2 0 2 1 1 1 1 3 2 2 3 3 3 3 1 3 3 0 2 0 1 1 0 0 1 3 3 1 0 1
 3 3 1 0 0 3 3 1 3 0 2 0 0 1 2 2 3 1 0 0 3 3 0 0 2 3 1 1 3 3 3 3 2 2 3 1 2
 3 2 1 2 2 1 3 3 0 1 2 0 3 2 0 1 2 1 3 3 1 1 1 3 1 2 3 2 2 1 3 1 3 1 0 2 2
 3 1 2 1 0 3 1 3 0 3 3 3 3 3 1 1 3 1 2 2 2 3 0 2 3 0 1 2 2 0 2 2 0 1 3 1 0
 0 3 0 3 2 2 0 0 1 1 1 2 3 0 2 1 3 0 2 1 1 1 3 2 2 0 1 3 3 0 2 0 3 2 2 3 3
 1 2 3 1 2 1 0 1 3 3 0 3 3 2 1 2 0 0 3 2 3 1 1 1 0 3 2 3 0 1 2 0 1 3 3 3 3
 2 2 0 2 1 3 3 2 1 3 2 1 1 1 1 1 2 2 2 3 3 1 1 2 1 2 2 1 0 3 0 0 1 3 3 3 0
 2 3 0 2 2 1 3 2 2 2 2 0 0 0 2 2 0 2 0 2 0 1 1 3 0 0 3 0 0 2 1 1 3 3 1 3 3
 3 3 0 

In [None]:
np.array(y_train_encoded)

array([3, 3, 3, ..., 3, 3, 3])

save 3 types of embeddings

In [None]:
# save y_train
np.save('/content/drive/My Drive/y_train.npy', np.array(y_train_encoded))


# save y_test
np.save('/content/drive/My Drive/y_test.npy', np.array(y_test_encoded))


In [None]:
# save x_train_cv
np.save('/content/drive/My Drive/x_train_cv.npy', train_cv.toarray())

# save x_train_tfidf
np.save('/content/drive/My Drive/x_train_tfidf.npy', train_tfidf.toarray())

# save x_train_word2vec
np.save('/content/drive/My Drive/x_train_word2vec.npy', train_word2vec)

# save x_test_cv
np.save('/content/drive/My Drive/x_test_cv.npy', test_cv.toarray())


# save x_test_tfidf
np.save('/content/drive/My Drive/x_test_tfidf.npy', test_tfidf.toarray())

# save x_test_word2vec
np.save('/content/drive/My Drive/x_test_word2vec.npy', test_word2vec)

OSError: 17630682770 requested and 7829178352 written