In [33]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

import nltk.data
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [25]:
# Load the raw review table from the CSV that sits next to the notebook.
reviews = pd.read_csv(filepath_or_buffer='reviews.csv')

In [26]:
# Replace every missing value with the placeholder string 'none' so the text
# cleaning further down always receives a string. Rebinding the result instead
# of using inplace=True keeps the cell free of in-place mutation.
reviews = reviews.fillna('none')

In [27]:
# Make sure the NLTK stop-word corpus is available locally; the recorded
# output below shows the package was already up to date on that run.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhinawtiwari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# English stop words held in a set, which clean() uses for membership tests.
stop_words = {*stopwords.words('english')}

In [29]:
def clean(review):
    """Normalise one raw review into a space-separated string of tokens.

    The text is stripped of HTML markup, every character outside
    ``[a-zA-Z0-9]`` is replaced by a space, the result is lower-cased and
    split on whitespace, and tokens present in the notebook-level
    ``stop_words`` set are dropped.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits GuessedAtParserWarning, so the
    # extracted text could differ from one environment to another.
    letters_only = BeautifulSoup(review, 'html.parser').get_text()
    letters_only = re.sub('[^a-zA-Z0-9]', ' ', letters_only)
    letters_only = letters_only.lower()
    words = letters_only.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

In [30]:
# Clean every comment once; tqdm wraps the column to show progress.
all_reviews = [clean(review) for review in tqdm(reviews['comments'])]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243183/243183 [00:18<00:00, 13272.48it/s]


In [31]:
# Spot-check one cleaned review (index 5) to eyeball the result of clean().
all_reviews[5]

'friend stayed tereas family first part trip ireland welcoming treated us like family house clean far dublin city center took less 10min walk luas station takes center helped us find things town gave us directions taught us lot irish culture even nice enough give us ride harbor look seals since rent car bit limited travels us really great experience definitely highlight trip even tried stay last portion trip sadly booked highly recommend staying kate kat'

In [34]:
# TF-IDF features over the cleaned reviews, limited to 500 terms by
# max_features; the sparse result is converted to a dense array right away.
tfidfvectorizer = TfidfVectorizer(max_features=500, analyzer='word')
reviews_tfidf = tfidfvectorizer.fit_transform(all_reviews).toarray()

In [35]:
# Raw count features over unigrams and bigrams, limited to 500 columns by
# max_features; the sparse result is converted to a dense array right away.
vectorizer = CountVectorizer(max_features=500, ngram_range=(1, 2), analyzer='word')
reviews_vect = vectorizer.fit_transform(all_reviews).toarray()

In [36]:
# Wrap the dense TF-IDF array in a DataFrame (default integer column labels).
reviews_tfidf = pd.DataFrame(reviews_tfidf)


In [37]:
# Preview the first five rows of the TF-IDF feature table.
reviews_tfidf.head(n=5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148649,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.180426,0.0,0.0,0.0,0.0,...,0.0,0.215103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23442,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263218,0.0


In [38]:
# Persist the TF-IDF features; index=False keeps the row index out of the file.
reviews_tfidf.to_csv(path_or_buf='reviews_tfidf_500.csv', index=False)