# **Text Presentation in NLP**

In [16]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [17]:
df = pd.read_csv('twitter_training.csv', usecols=['Tweet_Content'])
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,I am coming to the borders and I will kill you...
2,im getting on borderlands and i will kill you ...
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


In [18]:
df.shape

(74682, 1)

## **OneHotEncoding**

In [19]:
df = df.head(5000)

In [20]:
df = df.dropna(subset=['Tweet_Content'])

texts = df['Tweet_Content'].tolist()

tokenizer = Tokenizer(num_words=5000)

tokenizer.fit_on_texts(texts)

onehot_encoded = tokenizer.texts_to_matrix(texts, mode='binary')

onehot_encoded

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

## **Bag of Words with n grams**

### **Uni gram**

In [21]:
vectorizer = CountVectorizer(ngram_range=(1, 1))

bow_matrix = vectorizer.fit_transform(df)

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)
print("\nBag of Words matrix:")
print(bow_matrix.toarray())

Feature names (words and n-grams):
['tweet_content']

Bag of Words matrix:
[[1]]


### **Bi gram**

In [22]:
def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text

df['Tweet_Content'] = df['Tweet_Content'].apply(remove_stop_words)

In [23]:
vectorizer = CountVectorizer(ngram_range=(2, 2))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00 125' '00 borderlands' '000 grant' ... 'حبيت اللعبه' 'خلاص unk'
 'خلاص حبيت']

Bag of Words matrix (sparse format):
  (0, 8946)	1
  (0, 7492)	1
  (0, 2454)	1
  (1, 3793)	1
  (1, 2649)	1
  (2, 8946)	1
  (2, 7492)	1
  (2, 2415)	1
  (3, 2454)	1
  (3, 8937)	1
  (3, 3792)	1
  (4, 8946)	1
  (4, 7492)	1
  (4, 2454)	1
  (5, 8946)	1
  (5, 7492)	1
  (5, 2454)	1
  (6, 15888)	1
  (6, 8784)	1
  (6, 10720)	1
  (6, 15705)	1
  (6, 6910)	1
  (6, 9510)	1
  (6, 8819)	1
  (6, 2339)	1
  :	:
  (4950, 19409)	1
  (4950, 7496)	1
  (4950, 5196)	1
  (4950, 3526)	1
  (4951, 9641)	1
  (4951, 8811)	1
  (4951, 1082)	2
  (4951, 1052)	1
  (4951, 17883)	1
  (4951, 9690)	1
  (4951, 15210)	1
  (4951, 8652)	1
  (4951, 632)	1
  (4951, 2945)	1
  (4951, 17336)	1
  (4951, 323)	1
  (4951, 17334)	1
  (4951, 278)	1
  (4951, 19644)	1
  (4951, 2946)	1
  (4951, 15648)	1
  (4951, 19409)	1
  (4951, 7496)	1
  (4951, 5196)	1
  (4951, 3526)	1


### **Tri gram**

In [24]:
vectorizer = CountVectorizer(ngram_range=(3, 3))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00 125 euro' '00 borderlands great' '000 grant fqc' ...
 'خلاص unk اللعبه' 'خلاص حبيت bin' 'خلاص حبيت اللعبه']

Bag of Words matrix (sparse format):
  (0, 10867)	1
  (0, 9109)	1
  (1, 4768)	1
  (2, 10867)	1
  (2, 9108)	1
  (3, 10855)	1
  (3, 4766)	1
  (4, 10867)	1
  (4, 9109)	1
  (5, 10867)	1
  (5, 9109)	1
  (6, 19153)	1
  (6, 10650)	1
  (6, 13033)	1
  (6, 18957)	1
  (6, 8362)	1
  (6, 11497)	1
  (6, 10725)	1
  (6, 2828)	1
  (6, 7291)	1
  (6, 13255)	1
  (6, 14793)	1
  (6, 7407)	1
  (6, 4146)	1
  (6, 5652)	1
  :	:
  (4950, 23391)	1
  (4950, 9113)	1
  (4950, 6378)	1
  (4950, 10696)	1
  (4951, 1241)	1
  (4951, 21518)	1
  (4951, 11730)	1
  (4951, 18387)	1
  (4951, 10490)	1
  (4951, 748)	1
  (4951, 3691)	1
  (4951, 1272)	1
  (4951, 20831)	1
  (4951, 367)	1
  (4951, 1270)	1
  (4951, 20830)	1
  (4951, 317)	1
  (4951, 11669)	1
  (4951, 23695)	1
  (4951, 3692)	1
  (4951, 18887)	1
  (4951, 23391)	1
  (4951, 9113)	1
  (4951, 6378)	1
  (4951, 10696)	1


### **Quad gram**

In [25]:
vectorizer = CountVectorizer(ngram_range=(4, 4))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00 125 euro ift' '00 borderlands great week' '000 grant fqc bri' ...
 'خلاص unk اللعبه callofduty' 'خلاص حبيت bin اللعبه'
 'خلاص حبيت اللعبه callofduty']

Bag of Words matrix (sparse format):
  (0, 11358)	1
  (2, 11357)	1
  (3, 11347)	1
  (4, 11358)	1
  (5, 11358)	1
  (6, 20151)	1
  (6, 11195)	1
  (6, 13626)	1
  (6, 19934)	1
  (6, 8784)	1
  (6, 12017)	1
  (6, 11229)	1
  (6, 2943)	1
  (6, 7662)	1
  (6, 13853)	1
  (6, 15477)	1
  (6, 7786)	1
  (6, 4392)	1
  (6, 5995)	1
  (6, 13586)	1
  (6, 23358)	1
  (6, 16259)	1
  (6, 15931)	1
  (6, 11390)	1
  (6, 23046)	1
  :	:
  (4950, 3894)	1
  (4950, 19864)	1
  (4950, 24507)	1
  (4950, 9584)	1
  (4950, 6745)	1
  (4951, 1309)	1
  (4951, 22555)	1
  (4951, 12262)	1
  (4951, 19360)	1
  (4951, 11025)	1
  (4951, 801)	1
  (4951, 3893)	1
  (4951, 1336)	1
  (4951, 21901)	1
  (4951, 388)	1
  (4951, 1335)	1
  (4951, 21900)	1
  (4951, 335)	1
  (4951, 12204)	1
  (4951, 24829)	1
  (4951, 3894)	1
  (4951, 19864)	1
  (4951, 24507

### **Uni Bi gram**

In [26]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00' '00 125' '00 borderlands' ... 'خلاص' 'خلاص unk' 'خلاص حبيت']

Bag of Words matrix (sparse format):
  (0, 11249)	1
  (0, 9437)	1
  (0, 2865)	1
  (0, 14492)	1
  (0, 11261)	1
  (0, 9441)	1
  (0, 3084)	1
  (1, 4764)	1
  (1, 3280)	1
  (1, 11938)	1
  (1, 4767)	1
  (1, 3283)	1
  (2, 11249)	1
  (2, 9437)	1
  (2, 2865)	1
  (2, 11261)	1
  (2, 9441)	1
  (2, 11938)	1
  (2, 3045)	1
  (3, 11249)	1
  (3, 2865)	1
  (3, 14492)	1
  (3, 3084)	1
  (3, 4764)	1
  (3, 11252)	1
  :	:
  (4951, 12293)	1
  (4951, 19280)	1
  (4951, 10876)	1
  (4951, 866)	1
  (4951, 3683)	2
  (4951, 437)	1
  (4951, 367)	1
  (4951, 9474)	1
  (4951, 22683)	1
  (4951, 12294)	1
  (4951, 19281)	1
  (4951, 10877)	1
  (4951, 867)	1
  (4951, 3684)	1
  (4951, 22001)	1
  (4951, 438)	1
  (4951, 21999)	1
  (4951, 368)	1
  (4951, 24830)	1
  (4951, 3685)	1
  (4951, 19876)	1
  (4951, 24539)	1
  (4951, 9445)	1
  (4951, 6639)	1
  (4951, 4472)	1


### **Bi Tri gram**

In [27]:
vectorizer = CountVectorizer(ngram_range=(2, 3))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00 125' '00 125 euro' '00 borderlands' ... 'خلاص حبيت' 'خلاص حبيت bin'
 'خلاص حبيت اللعبه']

Bag of Words matrix (sparse format):
  (0, 19813)	1
  (0, 16600)	1
  (0, 5455)	1
  (0, 19814)	1
  (0, 16602)	1
  (1, 8561)	1
  (1, 5977)	1
  (1, 8562)	1
  (2, 19813)	1
  (2, 16600)	1
  (2, 19814)	1
  (2, 5366)	1
  (2, 16601)	1
  (3, 5455)	1
  (3, 19792)	1
  (3, 8558)	1
  (3, 19793)	1
  (3, 8559)	1
  (4, 19813)	1
  (4, 16600)	1
  (4, 5455)	1
  (4, 19814)	1
  (4, 16602)	1
  (5, 19813)	1
  (5, 16600)	1
  :	:
  (4951, 42800)	1
  (4951, 16609)	1
  (4951, 11574)	1
  (4951, 7895)	1
  (4951, 2294)	1
  (4951, 39402)	1
  (4951, 21421)	1
  (4951, 33598)	1
  (4951, 19143)	1
  (4951, 1381)	1
  (4951, 6637)	1
  (4951, 2355)	1
  (4951, 38168)	1
  (4951, 691)	1
  (4951, 2353)	1
  (4951, 38165)	1
  (4951, 596)	1
  (4951, 21311)	1
  (4951, 43340)	1
  (4951, 6639)	1
  (4951, 34536)	1
  (4951, 42801)	1
  (4951, 16610)	1
  (4951, 11575)	1
  (4951, 19508)	1


### **Tri Quad gram**

In [28]:
vectorizer = CountVectorizer(ngram_range=(3, 4))

bow_matrix = vectorizer.fit_transform(df['Tweet_Content'])

feature_names = vectorizer.get_feature_names_out()

print("Feature names (words and n-grams):")
print(feature_names)

print("\nBag of Words matrix (sparse format):")
print(bow_matrix)

Feature names (words and n-grams):
['00 125 euro' '00 125 euro ift' '00 borderlands great' ...
 'خلاص حبيت bin اللعبه' 'خلاص حبيت اللعبه' 'خلاص حبيت اللعبه callofduty']

Bag of Words matrix (sparse format):
  (0, 22224)	1
  (0, 18689)	1
  (0, 22226)	1
  (1, 9866)	1
  (2, 22224)	1
  (2, 18688)	1
  (2, 22225)	1
  (3, 22202)	1
  (3, 9863)	1
  (3, 22203)	1
  (4, 22224)	1
  (4, 18689)	1
  (4, 22226)	1
  (5, 22224)	1
  (5, 18689)	1
  (5, 22226)	1
  (6, 39304)	1
  (6, 21845)	1
  (6, 26659)	1
  (6, 38891)	1
  (6, 17146)	1
  (6, 23514)	1
  (6, 21954)	1
  (6, 5771)	1
  (6, 14953)	1
  :	:
  (4951, 38751)	1
  (4951, 47898)	1
  (4951, 18697)	1
  (4951, 13123)	1
  (4951, 21920)	1
  (4951, 2551)	1
  (4951, 44074)	1
  (4951, 23993)	1
  (4951, 37748)	1
  (4951, 21516)	1
  (4951, 1550)	1
  (4951, 7585)	1
  (4951, 2609)	1
  (4951, 42733)	1
  (4951, 756)	1
  (4951, 2606)	1
  (4951, 42731)	1
  (4951, 653)	1
  (4951, 23874)	1
  (4951, 48525)	1
  (4951, 7587)	1
  (4951, 38752)	1
  (4951, 47899)	1
  (4951, 18

## **Tf- Idf**

In [29]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df)

feature_names = tfidf_vectorizer.get_feature_names_out()

print("Feature names (terms):")
print(feature_names)
print("\nTF-IDF matrix:")
print(tfidf_matrix.toarray())

Feature names (terms):
['tweet_content']

TF-IDF matrix:
[[1.]]
