# Bag of words (countvectorizer)

In [114]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [115]:
df = pd.DataFrame({"Text":["i love do love" , "I love cat cat" , "i love bharat inidia" , "i love football cat"]})

In [116]:
df

Unnamed: 0,Text
0,i love do love
1,I love cat cat
2,i love bharat inidia
3,i love football cat


In [117]:
cv = CountVectorizer(tokenizer=lambda txt: txt.split())

In [118]:
bow = cv.fit_transform(df["Text"])

In [119]:
cv.get_feature_names_out()

array(['bharat', 'cat', 'do', 'football', 'i', 'inidia', 'love'],
      dtype=object)

In [120]:
print(bow.toarray())

[[0 0 1 0 1 0 2]
 [0 2 0 0 1 0 1]
 [1 0 0 0 1 1 1]
 [0 1 0 1 1 0 1]]


In [121]:
cv.vocabulary_

{'i': 4, 'love': 6, 'do': 2, 'cat': 1, 'bharat': 0, 'inidia': 5, 'football': 3}

# TF-IDF

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [123]:
df = pd.DataFrame({"Text":[" love do love" , " love cat cat" , "i love bharat india" , "i love football cat"]})
df

Unnamed: 0,Text
0,love do love
1,love cat cat
2,i love bharat india
3,i love football cat


In [124]:
tfidf = TfidfVectorizer(tokenizer=lambda txt: txt.split())

In [125]:
matrix = tfidf.fit_transform(df["Text"])
matrix.shape

(4, 7)

In [126]:
matrix.toarray()

array([[0.        , 0.        , 0.69183461, 0.        , 0.        ,
        0.        , 0.722056  ],
       [0.        , 0.94936136, 0.        , 0.        , 0.        ,
        0.        , 0.31418628],
       [0.58783765, 0.        , 0.        , 0.        , 0.46345796,
        0.58783765, 0.30675807],
       [0.        , 0.4970962 , 0.        , 0.6305035 , 0.4970962 ,
        0.        , 0.32902288]])

In [127]:
tfidf.vocabulary_

{'love': 6, 'do': 2, 'cat': 1, 'i': 4, 'bharat': 0, 'india': 5, 'football': 3}

# N-Grams


In [128]:
from nltk import ngrams

In [129]:
text = "this is nitin and he is very smart and playes only mage never rotate"

In [130]:
n = 4

In [131]:
n_grams = ngrams(text.split(), n)

In [134]:
for grams in n_grams:
    print(grams)

('this', 'is', 'nitin', 'and')
('is', 'nitin', 'and', 'he')
('nitin', 'and', 'he', 'is')
('and', 'he', 'is', 'very')
('he', 'is', 'very', 'smart')
('is', 'very', 'smart', 'and')
('very', 'smart', 'and', 'playes')
('smart', 'and', 'playes', 'only')
('and', 'playes', 'only', 'mage')
('playes', 'only', 'mage', 'never')
('only', 'mage', 'never', 'rotate')


# Sentiment Analysis

In [135]:
from textblob import TextBlob

In [136]:
df = pd.DataFrame({"Text":[" love my india" , "i hate bad words" , "im an pure soule" , "pepole always has evil mind"]})
df

Unnamed: 0,Text
0,love my india
1,i hate bad words
2,im an pure soule
3,pepole always has evil mind


In [137]:
def sentiment(text:str):

    sen = TextBlob(text)
    number = sen.sentiment.polarity

    if number>=0.5:
        return "Positive"
    
    elif number < 0:
        return "Negtive"
    
    else:
        return "Neutral"

In [138]:
df["sentiment"] = df["Text"].apply(sentiment)

In [139]:
df

Unnamed: 0,Text,sentiment
0,love my india,Positive
1,i hate bad words,Negtive
2,im an pure soule,Neutral
3,pepole always has evil mind,Negtive


# Web Scraping

In [140]:
import requests
import re 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np 

In [152]:
r = requests.get("https://collegedunia.com/college/3717-ramniranjan-jhunjhunwala-college-rjc-mumbai/reviews")
print(r.status_code)
soup = BeautifulSoup(r.text , "html.parser")
regex = re.compile(".*jsx-4290025793.*")
results = soup.find_all('p' , {'class':regex})
reviews = [result.text  for result in results]
reviews

403


[]

In [148]:
df = pd.DataFrame(np.array(reviews) , columns = ["reviews"])

In [149]:
df

Unnamed: 0,reviews


In [155]:
import gensim.downloader as api

In [156]:
word2vec = api.load("word2vec-google-news-300")



In [157]:
word2vec['cat'].shape

(300,)

In [158]:
word2vec.most_similar('cat')

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

In [159]:
word2vec.doesnt_match(['cat', 'kitten', 'elephant'])

'elephant'

In [160]:
vec2 = word2vec['INR'] - word2vec['INDIA'] + word2vec['ENGLAND']
word2vec.most_similar([vec2])

[('INR', 0.5168343186378479),
 ('ENGLAND', 0.5120964050292969),
 ('GBP', 0.4450418949127197),
 ('BLACKBURN_Rovers', 0.42433300614356995),
 ('Goalkeeper_Maik_Taylor', 0.4195941090583801),
 ('Sol_Campbell_Sylvain_Distin', 0.4178478419780731),
 ('Ledley_King_Jamie_Carragher', 0.4158109724521637),
 ('striker_Emile_Heskey', 0.41200312972068787),
 ('#,###.##_PER_GBP', 0.40780407190322876),
 ('Â_£', 0.40709665417671204)]

# Parser