# Text Representation

### 1. Gathering Data

In [1]:
import pandas as pd

# Path to the input CSV file
csv_file = '../../data/test_sample.csv'

# Read CSV into a DataFrame
df = pd.read_csv(csv_file)

# A notebook cell renders only its final expression, so bare `df` /
# `df["lemmatized"]` lines in the middle of the cell displayed nothing. The
# column lookup they performed is kept as an explicit check with a clear error.
expected_columns = ("lemmatized", "sentence_subjectivity")
missing_columns = [col for col in expected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{csv_file} is missing expected column(s): {missing_columns}")

# Final expression of the cell: this is what gets rendered as the cell output
df["sentence_subjectivity"]

0      ['subj']
1       ['obj']
2      ['subj']
3       ['obj']
4      ['subj']
         ...   
995    ['subj']
996    ['subj']
997     ['obj']
998    ['subj']
999    ['subj']
Name: sentence_subjectivity, Length: 1000, dtype: object

### 2. BOW (Bag of Words)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of Words: CountVectorizer counts how often each vocabulary word occurs
# in every row of the "lemmatized" column.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df["lemmatized"])

# Vocabulary terms; these are later used as the column labels for X_bow
features = vectorizer.get_feature_names_out()

print("after applying BOW : \n")
# Printing the sparse matrix lists its stored (row, column) coordinates and counts
print("(r_idx, c_idx) cnt\n", X_bow)
print("features :", features)

after applying BOW : 

(r_idx, c_idx) cnt
 <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4054 stored elements and shape (1000, 197)>
  Coords	Values
  (0, 127)	1
  (0, 150)	1
  (0, 129)	1
  (1, 181)	1
  (1, 157)	1
  (1, 143)	1
  (2, 157)	1
  (2, 179)	1
  (2, 180)	1
  (2, 193)	1
  (2, 140)	1
  (2, 87)	1
  (3, 96)	1
  (3, 48)	1
  (3, 144)	1
  (4, 38)	2
  (4, 20)	1
  (4, 192)	1
  (5, 157)	1
  (5, 15)	1
  (5, 145)	1
  (5, 138)	1
  (5, 41)	1
  (6, 114)	1
  (6, 64)	1
  :	:
  (993, 30)	1
  (993, 75)	1
  (994, 127)	1
  (994, 12)	1
  (994, 67)	1
  (994, 139)	1
  (994, 116)	1
  (995, 124)	1
  (995, 156)	1
  (995, 29)	1
  (996, 19)	1
  (996, 65)	1
  (996, 99)	1
  (996, 23)	1
  (997, 185)	1
  (997, 8)	1
  (997, 40)	1
  (997, 178)	1
  (998, 110)	1
  (998, 43)	1
  (998, 1)	1
  (998, 69)	1
  (999, 83)	1
  (999, 11)	1
  (999, 105)	1
features : ['310' 'absolutely' 'acceptable' 'ad' 'alright' 'amaze' 'amazing'
 'another' 'app' 'arrival' 'arrive' 'aspect' 'average' 'awesome' 'awful'
 'bad' 

### 3. Save BOW to CSV

In [3]:
# Expand the sparse count matrix into a dense table with one column per vocabulary term
bow_df = pd.DataFrame(X_bow.toarray(), columns=features)

# Save the features; index=False leaves the row index out of the CSV
bow_df.to_csv('../../data/bow_features.csv', index=False)

# A bare `bow_df` in the middle of the cell rendered nothing, because only the
# final expression of a cell is displayed; preview the first rows here instead.
bow_df.head()

### 4. TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF scales a term's weight down the more rows it appears in, so very
# common words score lower than rarer ones such as "awesome" or "disappointed".
# NOTE(review): this cell vectorizes df['text'], whereas the BOW cell uses
# df["lemmatized"] — confirm the different input column is intentional.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['text'])
features_tfidf = tfidf_vectorizer.get_feature_names_out()

# Dense table: one row per input row, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=features_tfidf,
)

tfidf_df

Unnamed: 0,10,about,absolutely,acceptable,again,all,alright,amazing,and,another,...,within,worked,working,worse,worst,worth,would,wrong,www,you
0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.406755,0.0,0.0,0.0,0.0
3,0.0,0.444808,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.333817,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
996,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
997,0.0,0.000000,0.000000,0.0,0.0,0.341803,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
998,0.0,0.000000,0.413414,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
