In [1]:
import spacy
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv("../data/train.csv")

# Feature extraction
### Stanford GloVe, or NLTK 

### bag of n-grams

In [2]:
nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe(df["Text"]))

In [3]:

# Count POS tags per sentence
def extract_pos_features(doc):
    pos_counts = Counter(token.pos_ for token in doc)
    return pos_counts

df["pos_counts"] = [extract_pos_features(doc) for doc in docs]

# Convert POS tag dictionary into DataFrame
pos_df = pd.DataFrame(df["pos_counts"].to_list()).fillna(0)

# Merge with original DataFrame
df = pd.concat([df, pos_df], axis=1).drop(columns=["pos_counts"])

In [4]:
df.head()

Unnamed: 0,Sentence_id,Text,Verdict,PRON,VERB,AUX,DET,NOUN,ADP,PUNCT,SCONJ,ADV,CCONJ,PROPN,ADJ,NUM,PART,INTJ,SYM,X
0,1,I think we've seen a deterioration of values.,-1,2.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,I think for a while as a nation we condoned th...,-1,3.0,3.0,2.0,3.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,"For a while, as I recall, it even seems to me ...",-1,7.0,6.0,1.0,1.0,3.0,3.0,4.0,2.0,1.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
3,4,"So we've seen a deterioration in values, and o...",-1,6.0,4.0,3.0,3.0,8.0,7.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
4,5,"We got away, we got into this feeling that val...",-1,2.0,2.0,1.0,2.0,3.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Convert lemmas into a single string per sentence
df["lemmas_str"] = [" ".join([token.lemma_ for token in doc]) for doc in docs]

# TF-IDF Vectorizer with N-grams
vectorizer = TfidfVectorizer(ngram_range=(2, 3), max_features=5000)  # Unigrams, bigrams, trigrams
X_tfidf = vectorizer.fit_transform(df["lemmas_str"])

# Convert to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Merge TF-IDF features with main DataFrame
df = pd.concat([df, tfidf_df], axis=1).drop(columns=["lemmas_str"])

In [6]:
# Flatten all POS tags
all_pos_tags = [token.pos_ for doc in docs for token in doc]

# Encode POS tags
le = LabelEncoder()
le.fit(all_pos_tags)

# Convert each sentence into an array of POS tag integers
df["pos_encoded"] = [[le.transform([token.pos_])[0] for token in doc] for doc in docs]

In [7]:
# save features
df.to_csv("../data/features.csv", index=False)

## Prepare data for ML

In [10]:
df = pd.read_csv("../data/features.csv")

In [11]:
df.head()

Unnamed: 0,Sentence_id,Text,Verdict,PRON,VERB,AUX,DET,NOUN,ADP,PUNCT,...,young woman,young worker,your child,your family,your own,your plan,your question,your taxis,your vote,pos_encoded
0,1,I think we've seen a deterioration of values.,-1,2.0,2.0,1.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[np.int64(10), np.int64(15), np.int64(10), np...."
1,2,I think for a while as a nation we condoned th...,-1,3.0,3.0,2.0,3.0,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[np.int64(10), np.int64(15), np.int64(1), np.i..."
2,3,"For a while, as I recall, it even seems to me ...",-1,7.0,6.0,1.0,1.0,3.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[np.int64(1), np.int64(5), np.int64(7), np.int..."
3,4,"So we've seen a deterioration in values, and o...",-1,6.0,4.0,3.0,3.0,8.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[np.int64(2), np.int64(10), np.int64(3), np.in..."
4,5,"We got away, we got into this feeling that val...",-1,2.0,2.0,1.0,2.0,3.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[np.int64(10), np.int64(15), np.int64(1), np.i..."


In [19]:
# Select feature columns
feature_cols = list(pos_df.columns) + list(tfidf_df.columns) + ["Verdict"]  # Combining POS and TF-IDF

# X = df[feature_cols]
# y = df["Verdict"]

features = df[feature_cols]


In [21]:
features.to_csv('../data/trainable_features.csv', index=False)

# apply the same transformations to test.csv

In [28]:
test_df = pd.read_csv("../data/test.csv")
test_docs = list(nlp.pipe(test_df["Text"]))

# Count POS tags per sentence
def extract_pos_features(doc):
    pos_counts = Counter(token.pos_ for token in doc)
    return pos_counts

test_df["pos_counts"] = [extract_pos_features(doc) for doc in test_docs]

# Convert POS tag dictionary into DataFrame
test_pos_df = pd.DataFrame(test_df["pos_counts"].to_list()).fillna(0)

# Merge with original DataFrame
test_df = pd.concat([test_df, test_pos_df], axis=1).drop(columns=["pos_counts"])

In [None]:
test_df["lemmas_str"] = [" ".join([token.lemma_ for token in doc]) for doc in test_docs]

x_test_tfidf = vectorizer.transform(test_df["lemmas_str"])

# Convert to DataFrame
test_tfidf_df = pd.DataFrame(x_test_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

test_df = pd.concat([test_df, test_tfidf_df], axis=1).drop(columns=["lemmas_str"])


In [32]:
test_df.columns

Index(['Sentence_id', 'Text', 'PRON', 'VERB', 'PUNCT', 'DET', 'NOUN', 'PROPN',
       'AUX', 'NUM',
       ...
       'young people', 'young woman', 'young worker', 'your child',
       'your family', 'your own', 'your plan', 'your question', 'your taxis',
       'your vote'],
      dtype='object', length=5020)

In [None]:
X_test = test_df.drop(columns=["Sentence_id", "Text"])

X_test.to_csv("../data/X_test_features.csv", index=False)
