In [1]:
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
nltk.download('reuters')
nltk.download('inaugural')

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to /home/alex/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package inaugural to /home/alex/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


True

In [3]:
stops = set(stopwords.words('german'))

In [4]:
words = ["In", "einigen", "U-Bahn-Stationen", "in", "Sankt", "Petersburg", "und", "Moskau", "gelten", "besondere", "Sicherheitsvorkehrungen"]

In [5]:
[word for word in words if word not in stops]

['In',
 'U-Bahn-Stationen',
 'Sankt',
 'Petersburg',
 'Moskau',
 'gelten',
 'besondere',
 'Sicherheitsvorkehrungen']

In [6]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [7]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [word for word in text if word.lower() not in stopwords]
    return len(para) / len(text)

In [9]:
para_fraction(nltk.corpus.reuters.words())

0.735240435097661

In [10]:
para_fraction(nltk.corpus.inaugural.words())

0.5235285085816138

In [11]:
# Stemming

In [12]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [13]:
lan_stemmer = LancasterStemmer()
port_stemmer = PorterStemmer()
snow_stemmer = SnowballStemmer('english')

In [14]:
words = ['Scientist', 'discovered', 'infections', 'invade']

In [15]:
for word in words:
    print(word + '  - - ' + lan_stemmer.stem(word) + '  --  ' + port_stemmer.stem(word) + '  --  ' + snow_stemmer.stem(word)) 

Scientist  - - sci  --  scientist  --  scientist
discovered  - - discov  --  discov  --  discov
infections  - - infect  --  infect  --  infect
invade  - - invad  --  invad  --  invad


In [16]:
# Lematization

In [17]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
lemmatizer = WordNetLemmatizer()
for word in words:
        print(word + '  - - ' + lemmatizer.lemmatize(word, pos="v")) 

Scientist  - - Scientist
discovered  - - discover
infections  - - infections
invade  - - invade


In [19]:
text = "First up is so-called Motion Mode, yet another feature bringing the Pixel lineup closer in functionality to much more expensive standalone cameras. This camera feature allows you to take sports and action shots — with moving subjects in the foreground and blurry backgrounds."

In [21]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [23]:
tokens = word_tokenize(text.lower())

In [24]:
stemmer = PorterStemmer()
porter_stemmed = [stemmer.stem(token) for token in tokens]

In [25]:
stemmer = SnowballStemmer('english')
snowball_stemmed = [stemmer.stem(token) for token in tokens]

In [29]:
df = pd.DataFrame({'token': tokens, 'porter_stemmed': porter_stemmed, 'snowball_stemmed': snowball_stemmed})

In [30]:
df = df[['token', 'porter_stemmed', 'snowball_stemmed']]

In [31]:
df

Unnamed: 0,token,porter_stemmed,snowball_stemmed
0,first,first,first
1,up,up,up
2,is,is,is
3,so-called,so-cal,so-cal
4,motion,motion,motion
5,mode,mode,mode
6,",",",",","
7,yet,yet,yet
8,another,anoth,anoth
9,feature,featur,featur
