# Text Mining Project

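This notebook loads, cleans, and explores a text dataset of property listings (`description`, `host_about`) and their guest reviews (`comments`). The training listings carry a binary `unlisted` label; the test listings do not. The texts are written in several languages.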

## 1. Installs and Imports

In [24]:
# Optional one-time setup: uncomment the lines below to install the
# third-party packages this notebook imports. Inside Jupyter, prefer `%pip`
# over `!pip` so the install targets the running kernel's environment.
#pip install pandas
#!pip install langdetect
#!pip install matplotlib
#!pip install seaborn
# NOTE: `re` is part of the Python standard library; the next line is not needed.
#!pip install re
# openpyxl is the engine pandas uses to read the .xlsx files.
#!pip install openpyxl
#!pip install googletrans==4.0.0-rc1

In [25]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from googletrans import Translator
from langdetect import DetectorFactory, detect


## 2. Data Exploration and Analysis

**Checking that the data was loaded correctly**

In [3]:
# Read the four Excel workbooks (paths are relative to the notebook's working
# directory) into one DataFrame each, in the order the names are unpacked.
workbook_names = ("train.xlsx", "train_reviews.xlsx", "test.xlsx", "test_reviews.xlsx")
train, train_reviews, test, test_reviews = map(pd.read_excel, workbook_names)

In [4]:
# Show the first five training listings (head() defaults to five rows).
train.head()

Unnamed: 0,index,description,host_about,unlisted
0,1,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0
1,2,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1
2,3,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1
3,4,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0
4,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0


In [5]:
# Show the first five training reviews (head() defaults to five rows).
train_reviews.head()

Unnamed: 0,index,comments
0,1,this is a very cozy and comfortable house to s...
1,1,good<br/>
2,1,"My first hostel experience, and all I have to ..."
3,1,Das Hostel war neu und deshalb funktionierte a...
4,1,"It was fine for a dorm, but I think for the pe..."


In [6]:
# Show the first five test listings (head() defaults to five rows).
test.head()

Unnamed: 0,index,description,host_about
0,1,<b>The space</b><br />Apartment located in the...,"Gosto de viajar, de conhecer pessoas, gosto de..."
1,2,"IMPORTANT: In response to COVID-19, this prope...",We are the be@home Team!\n\nYou can count on u...
2,3,"Bright, beautiful, and spacious. This four-bed...","Hi there!\n\nWe're GuestReady, a professional ..."
3,4,Charming Apartment Close to the Bay of Cascais...,:)
4,5,"Se procura umas férias perto da praia, casino ...",Bem vindos a Portugal!_x000D_\nAdoro o meu Paí...


In [7]:
# Show the first five test reviews (head() defaults to five rows).
test_reviews.head()

Unnamed: 0,index,comments
0,1,Thank you very much Antonio ! All has been per...
1,1,Very nice appartment in the old town of Lissab...
2,1,When travelling we're looking for kids friendl...
3,1,We've been in Lisbon in march 2013 (3 adults a...
4,1,Our host Antonio was very helpful with informa...


**Everything looks fine**

In [8]:
# Print the (rows, columns) shape of every loaded table, one line per table.
shape_report = (
    ("train shape:", train),
    ("train_reviews shape:", train_reviews),
    ("test shape:", test),
    ("test_reviews shape:", test_reviews),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

train shape: (6248, 4)
train_reviews shape: (361281, 2)
test shape: (695, 3)
test_reviews shape: (41866, 2)


### 2.1 Index Fix

**In the dataset, the `index` column has repeating values, which likely indicates that each value identifies a specific house. To make this clearer, we rename this column to `houseID` and use it as the index of each table.**

In [9]:
# For train DataFrame
train['houseID'] = train.index  # Create 'houseID' column with index values
train.set_index('houseID', inplace=True)  # Set 'houseID' as the new index
train.drop(columns=['index'], inplace=True)  # Remove the column named 'index'

# For train_reviews DataFrame
train_reviews['houseID'] = train_reviews.index
train_reviews.set_index('houseID', inplace=True)
train_reviews.drop(columns=['index'], inplace=True)

# For test DataFrame
test['houseID'] = test.index
test.set_index('houseID', inplace=True)
test.drop(columns=['index'], inplace=True)

# For test_reviews DataFrame
test_reviews['houseID'] = test_reviews.index
test_reviews.set_index('houseID', inplace=True)
test_reviews.drop(columns=['index'], inplace=True)


In [10]:
# Show the first five training listings now that houseID is the index.
train.head()

Unnamed: 0_level_0,description,host_about,unlisted
houseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0
1,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1
2,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1
3,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0
4,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0


In [11]:
# Show the first five training reviews now that houseID is the index.
train_reviews.head()

Unnamed: 0_level_0,comments
houseID,Unnamed: 1_level_1
0,this is a very cozy and comfortable house to s...
1,good<br/>
2,"My first hostel experience, and all I have to ..."
3,Das Hostel war neu und deshalb funktionierte a...
4,"It was fine for a dorm, but I think for the pe..."


In [12]:
# Show the first five test listings now that houseID is the index.
test.head()

Unnamed: 0_level_0,description,host_about
houseID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,<b>The space</b><br />Apartment located in the...,"Gosto de viajar, de conhecer pessoas, gosto de..."
1,"IMPORTANT: In response to COVID-19, this prope...",We are the be@home Team!\n\nYou can count on u...
2,"Bright, beautiful, and spacious. This four-bed...","Hi there!\n\nWe're GuestReady, a professional ..."
3,Charming Apartment Close to the Bay of Cascais...,:)
4,"Se procura umas férias perto da praia, casino ...",Bem vindos a Portugal!_x000D_\nAdoro o meu Paí...


In [13]:
# Show the first five test reviews now that houseID is the index.
test_reviews.head()

Unnamed: 0_level_0,comments
houseID,Unnamed: 1_level_1
0,Thank you very much Antonio ! All has been per...
1,Very nice appartment in the old town of Lissab...
2,When travelling we're looking for kids friendl...
3,We've been in Lisbon in march 2013 (3 adults a...
4,Our host Antonio was very helpful with informa...


### 2.2 Duplicates

**Checking for Duplicated Rows**

In [14]:
# Count fully duplicated rows in each table.
# Note: duplicated() compares column values only; the houseID index is not
# part of the comparison.
duplicate_report = (
    ("Number of duplicated rows in train DataFrame:", train),
    ("Number of duplicated rows in train_reviews DataFrame:", train_reviews),
    ("Number of duplicated rows in test DataFrame:", test),
    ("Number of duplicated rows in test_reviews DataFrame:", test_reviews),
)
for caption, frame in duplicate_report:
    print(caption, frame.duplicated().sum())

Number of duplicated rows in train DataFrame: 192
Number of duplicated rows in train_reviews DataFrame: 6781
Number of duplicated rows in test DataFrame: 4
Number of duplicated rows in test_reviews DataFrame: 438


**Removing Duplicated Rows**

In [15]:
# Remove duplicated rows from every table in place (first occurrence is kept),
# then re-count to confirm nothing is left.
# NOTE(review): this also removes rows from the test tables -- confirm that
# dropping test listings is intended.
for frame in (train, train_reviews, test, test_reviews):
    frame.drop_duplicates(inplace=True)

remaining_report = (
    ("Number of duplicated rows in train DataFrame after removal:", train),
    ("Number of duplicated rows in train_reviews DataFrame after removal:", train_reviews),
    ("Number of duplicated rows in test DataFrame after removal:", test),
    ("Number of duplicated rows in test_reviews DataFrame after removal:", test_reviews),
)
for caption, frame in remaining_report:
    print(caption, frame.duplicated().sum())


Number of duplicated rows in train DataFrame after removal: 0
Number of duplicated rows in train_reviews DataFrame after removal: 0
Number of duplicated rows in test DataFrame after removal: 0
Number of duplicated rows in test_reviews DataFrame after removal: 0


### 2.3 Null Values

In [16]:
# Print the per-column null counts of each table under its own heading.
null_report = (
    ("Number of null values in train DataFrame:", train),
    ("\nNumber of null values in train_reviews DataFrame:", train_reviews),
    ("\nNumber of null values in test DataFrame:", test),
    ("\nNumber of null values in test_reviews DataFrame:", test_reviews),
)
for heading, frame in null_report:
    print(heading)
    print(frame.isnull().sum())

Number of null values in train DataFrame:
description    0
host_about     0
unlisted       0
dtype: int64

Number of null values in train_reviews DataFrame:
comments    1
dtype: int64

Number of null values in test DataFrame:
description    0
host_about     0
dtype: int64

Number of null values in test_reviews DataFrame:
comments    0
dtype: int64


In [18]:
# Display the training reviews whose comment text is missing.
missing_comment = train_reviews['comments'].isna()
train_reviews[missing_comment]

Unnamed: 0_level_0,comments
houseID,Unnamed: 1_level_1
97996,


In [20]:
# Discard training reviews with no comment text, then display the rows that
# are still missing a comment (expected: none).
train_reviews = train_reviews.dropna(subset=['comments'])
still_missing = train_reviews['comments'].isna()
train_reviews[still_missing]

Unnamed: 0_level_0,comments
houseID,Unnamed: 1_level_1


In [22]:
# Keep independent copies of the cleaned tables so later cells can transform
# the working frames without losing this state.
train_original, train_reviews_original, test_original, test_reviews_original = (
    frame.copy() for frame in (train, train_reviews, test, test_reviews)
)

### 2.4 Merge Datasets

In [26]:
# Attach the reviews to their listing via the shared 'houseID' key.
# A left join keeps every listing even when it has no matching review row;
# the default inner join would silently drop such listings.
train_merged = train.merge(train_reviews, on='houseID', how='left')
test_merged = test.merge(test_reviews, on='houseID', how='left')

# Listings without a matching review get an empty comment instead of NaN, so
# later text processing always receives a string.
train_merged['comments'] = train_merged['comments'].fillna('')
test_merged['comments'] = test_merged['comments'].fillna('')

In [30]:
# Show the first five rows of the merged training table.
train_merged.head()

Unnamed: 0_level_0,description,host_about,unlisted,comments
houseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0,this is a very cozy and comfortable house to s...
1,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1,good<br/>
2,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1,"My first hostel experience, and all I have to ..."
3,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0,Das Hostel war neu und deshalb funktionierte a...
4,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0,"It was fine for a dorm, but I think for the pe..."


In [29]:
# Show the first five rows of the merged test table.
test_merged.head()

Unnamed: 0_level_0,description,host_about,comments
houseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,<b>The space</b><br />Apartment located in the...,"Gosto de viajar, de conhecer pessoas, gosto de...",Thank you very much Antonio ! All has been per...
1,"IMPORTANT: In response to COVID-19, this prope...",We are the be@home Team!\n\nYou can count on u...,Very nice appartment in the old town of Lissab...
2,"Bright, beautiful, and spacious. This four-bed...","Hi there!\n\nWe're GuestReady, a professional ...",When travelling we're looking for kids friendl...
3,Charming Apartment Close to the Bay of Cascais...,:),We've been in Lisbon in march 2013 (3 adults a...
4,"Se procura umas férias perto da praia, casino ...",Bem vindos a Portugal!_x000D_\nAdoro o meu Paí...,Our host Antonio was very helpful with informa...


### 2.5 Language Problem

**We realised that not all the reviews are written in the same language, so we decided to identify which language each text is in.**

In [32]:
def detect_language(text):
    """Return the language code `detect` assigns to `text`, or "Unknown".

    Parameters
    ----------
    text : object
        The value to classify. Anything that is not a non-blank string
        (None, NaN, numbers, whitespace only) is reported as "Unknown"
        without calling the detector.

    Returns
    -------
    str
        Whatever `detect(text)` returns, or the literal "Unknown" when the
        input is unusable or the detector raises an error.
    """
    # Non-text and blank values cannot be classified; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    try:
        return detect(text)
    except Exception:
        # Best effort: a detector failure must not abort the whole column.
        # `Exception` (rather than a bare `except:`) still lets
        # KeyboardInterrupt and SystemExit stop a long-running cell.
        return "Unknown"

In [33]:
# langdetect is randomised internally; fixing the seed makes the detected
# languages reproducible from one run to the next.
DetectorFactory.seed = 0

# NOTE(review): the language is detected on the listing description, while the
# section text talks about reviews ('comments') -- confirm the intended column.
for merged in (train_merged, test_merged):
    descriptions = merged["description"]
    # The same description can appear on many rows, so run the (slow) detector
    # once per distinct text and map the result back onto every row.
    language_by_description = {
        text: detect_language(text) for text in descriptions.unique()
    }
    merged["language"] = descriptions.map(language_by_description).fillna("Unknown")

## 3. Pre-Process

### 3.1 Cleaning Data

#### 3.1.1 Translating

### Feature Engineering ?