In [1]:
import pandas as pd
df = pd.read_csv('train_test.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,Free Natural Parenting Magazine - Positive Par...,environment
1,1,7 Ways to Keep Our Environment Clean and Safe,environment
2,2,With environmental issues like water contamina...,environment
3,3,Today we’re looking at the steps – big and sma...,environment
4,4,"Make your voice heard: vote, sign petitions, c...",environment


In [3]:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_lg")

text = df.text 

tokenizer = Tokenizer(nlp.vocab)

tokens = []
STOP_WORDS = nlp.Defaults.stop_words.union(['\n', '\n\n', 'The', 'I',])

for doc in tokenizer.pipe(df['text'], batch_size=500):
    doc_tokens = []
    
    for token in doc:
        if token.text not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

In [5]:
df['tokens'].head()

0    [free, natural, parenting, magazine, -, positi...
1       [7, ways, keep, our, environment, clean, safe]
2    [with, environmental, issues, like, water, con...
3    [today, we’re, looking, steps, –, big, small, ...
4    [make, voice, heard:, vote,, sign, petitions,,...
Name: tokens, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

dtm = tfidf.fit_transform(df['text'])

docs = pd.DataFrame(dtm.todense(), columns = tfidf.get_feature_names())
docs.head()

Unnamed: 0,00,000,002,00am,015,018,02,02656710310482140,03,036,...,zeros,zone,zones,zoning,zorb,zürich,çukurova,émile,öztaş,üstün
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')

nn.fit(dtm.todense())

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [30]:
new = tfidf.transform(["Attend your city council"])
nn.kneighbors(new.todense())

(array([[0.54403572, 1.        , 1.        , 1.        , 1.        ]]),
 array([[  8, 396, 393,  45, 640]], dtype=int64))

In [31]:
df[df.index==8].text

8    Attend your city council meeting. 
Name: text, dtype: object

In [22]:
df.head(30)

Unnamed: 0.1,Unnamed: 0,text,label,tokens
0,0,Free Natural Parenting Magazine - Positive Par...,environment,"[free, natural, parenting, magazine, -, positi..."
1,1,7 Ways to Keep Our Environment Clean and Safe,environment,"[7, ways, keep, our, environment, clean, safe]"
2,2,With environmental issues like water contamina...,environment,"[with, environmental, issues, like, water, con..."
3,3,Today we’re looking at the steps – big and sma...,environment,"[today, we’re, looking, steps, –, big, small, ..."
4,4,"Make your voice heard: vote, sign petitions, c...",environment,"[make, voice, heard:, vote,, sign, petitions,,..."
5,5,Your state and local representatives need to h...,environment,"[your, state, local, representatives, need, he..."
6,6,Here are a few ways you can voice your concern...,environment,"[here, ways, voice, concerns, ask, corporation..."
7,7,Write a letter to your local newspaper.,environment,"[write, letter, local, newspaper.]"
8,8,Attend your city council meeting.,environment,"[attend, city, council, meeting.]"
9,9,Find out who your Representatives and Senators...,environment,"[find, representatives, senators, callmycongre..."
