# Natural Language Processing DAY 5

# Term Frequency and Inverse Document Frequency

In [1]:
# In a bag-of-words model every word gets the same weight, regardless of how informative it is.

# To avoid that, we can use term frequency and inverse document frequency (TF-IDF) instead.

In [2]:
import nltk

In [3]:
# Sample text for the notebook: it is split into sentences below, and each
# cleaned sentence becomes one document of the TF-IDF corpus.
paragraph = """Glaciers are melting, sea levels are rising, cloud forests are dying, and wildlife is scrambling to keep pace. 
It has become clear that humans have caused most of the past century's warming by releasing heat-trapping gases as we power our modern lives. 
Called greenhouse gases, their levels are higher now than at any time in the last 800,000 years.

We often call the result global warming,
but it is causing a set of changes to the Earth's climate, or long-term weather patterns,
that varies from place to place. While many people think of global warming and climate change as synonyms, 
scientists use “climate change” when describing the complex shifts now affecting our planet’s weather and climate systems—in part because some areas actually get cooler in the short term.

Climate change encompasses not only rising average temperatures but also extreme weather events,
shifting wildlife populations and habitats, rising seas,
and a range of other impacts.
All of those changes are emerging as humans continue to add heat-trapping greenhouse gases to the atmosphere, 
changing the rhythms of climate that all living things have come to rely on.

What will we do—what can we do—to slow this human-caused warming?
How will we cope with the changes we've already set into motion?
While we struggle to figure it all out, the fate of the Earth as we know it—coasts, forests,
farms, and snow-capped mountains—hangs in the balance. """

In [4]:
# data cleaning libraries
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# Create the WordNet lemmatizer and the Porter stemmer instances.
wordnet = WordNetLemmatizer()
ps = PorterStemmer()

In [6]:
# Split the paragraph into a list of sentence strings.
# NOTE(review): presumably needs the NLTK sentence-tokenizer data to be
# downloaded beforehand — confirm for a fresh environment.
sentences = nltk.tokenize.sent_tokenize(paragraph)

In [7]:
import pandas as pd

In [8]:
# One row per raw sentence, in a single column, for easy inspection.
sent = pd.DataFrame(data=sentences)

In [9]:
# Display the raw sentences (one per row).
sent

Unnamed: 0,0
0,"Glaciers are melting, sea levels are rising, c..."
1,It has become clear that humans have caused mo...
2,"Called greenhouse gases, their levels are high..."
3,"We often call the result global warming,\nbut ..."
4,While many people think of global warming and ...
5,Climate change encompasses not only rising ave...
6,All of those changes are emerging as humans co...
7,What will we do—what can we do—to slow this hu...
8,How will we cope with the changes we've alread...
9,"While we struggle to figure it all out, the fa..."


In [10]:
# Clean each sentence: keep letters only, lowercase, drop English stop words,
# and lemmatize the remaining tokens. The result is one cleaned string per
# sentence, collected in `corpus`.

# Build the stop-word set once; it does not change between tokens, so
# recomputing it inside the comprehension only repeats the same work.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Every character that is not an ASCII letter (digits, punctuation,
    # curly quotes, dashes) is replaced by a space.
    review = re.sub('[^A-Za-z]', ' ', sentence)
    review = review.lower().split()
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
# One row per cleaned sentence, mirroring the raw-sentence frame.
sent2 = pd.DataFrame(data=corpus)

In [12]:
# Display the cleaned sentences (one per row).
sent2

Unnamed: 0,0
0,glacier melting sea level rising cloud forest ...
1,become clear human caused past century warming...
2,called greenhouse gas level higher time last year
3,often call result global warming causing set c...
4,many people think global warming climate chang...
5,climate change encompasses rising average temp...
6,change emerging human continue add heat trappi...
7,slow human caused warming
8,cope change already set motion
9,struggle figure fate earth know coast forest f...


In [13]:
# Each sentence has now been lowercased, stripped of stop words, and lemmatized.

In [14]:
# Creating the TF-IDF model

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# TF-IDF vectorizer with default settings; the name `cv` is referenced by the
# fit_transform cell that follows.
cv = TfidfVectorizer()

In [17]:
# Fit the vectorizer on the cleaned sentences, then convert the result of
# fit_transform into a dense array via .toarray().
tfidf_matrix = cv.fit_transform(corpus)
x = tfidf_matrix.toarray()

In [18]:
# Display the dense TF-IDF array (one row per sentence).
x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.26094012,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.37800648],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [19]:
# Wrap the dense TF-IDF array in a DataFrame. No column labels are passed, so
# the columns are positional integers rather than vocabulary terms.
data_corpus = pd.DataFrame(data=x)

In [20]:
# Show up to the first 20 rows; the frame has one row per sentence.
data_corpus.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26094,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29221,...,0.0,0.0,0.0,0.248405,0.0,0.0,0.193217,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.378006,0.0,0.0,0.0,0.0,0.0,0.0,0.378006
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.252612,0.167034,0.187875,0.0,0.0
4,0.196947,0.0,0.196947,0.0,0.0,0.196947,0.0,0.0,0.0,0.0,...,0.0,0.196947,0.0,0.0,0.196947,0.0,0.130227,0.146475,0.0,0.0
5,0.0,0.0,0.0,0.0,0.244859,0.0,0.0,0.244859,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182109,0.208153,0.0
6,0.0,0.266652,0.0,0.0,0.0,0.0,0.266652,0.0,0.0,0.0,...,0.266652,0.0,0.0,0.226678,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.401445,0.0,0.0,0.0
8,0.0,0.0,0.0,0.49536,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283463,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
