# Text Mining Project

Description

## 1. Installs and Imports

In [1]:
#pip install pandas
#!pip install langdetect
#!pip install matplotlib
#!pip install seaborn 
#!pip install re
#!pip install openpyxl

In [2]:
import pandas as pd
from langdetect import detect
import matplotlib.pyplot as plt
import seaborn as sns
import re

## 2. Data Exploration

**Checking if data was loaded Correctly**

In [3]:
train = pd.read_excel("train.xlsx")
train_reviews = pd.read_excel("train_reviews.xlsx")
test = pd.read_excel("test.xlsx")
test_reviews = pd.read_excel("test_reviews.xlsx")

In [4]:
train.head(5)

Unnamed: 0,index,description,host_about,unlisted
0,1,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0
1,2,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1
2,3,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1
3,4,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0
4,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0


In [5]:
train_reviews.head(5)

Unnamed: 0,index,comments
0,1,this is a very cozy and comfortable house to s...
1,1,good<br/>
2,1,"My first hostel experience, and all I have to ..."
3,1,Das Hostel war neu und deshalb funktionierte a...
4,1,"It was fine for a dorm, but I think for the pe..."


In [6]:
test.head(5)

Unnamed: 0,index,description,host_about
0,1,<b>The space</b><br />Apartment located in the...,"Gosto de viajar, de conhecer pessoas, gosto de..."
1,2,"IMPORTANT: In response to COVID-19, this prope...",We are the be@home Team!\n\nYou can count on u...
2,3,"Bright, beautiful, and spacious. This four-bed...","Hi there!\n\nWe're GuestReady, a professional ..."
3,4,Charming Apartment Close to the Bay of Cascais...,:)
4,5,"Se procura umas férias perto da praia, casino ...",Bem vindos a Portugal!_x000D_\nAdoro o meu Paí...


In [7]:
test_reviews.head(5)

Unnamed: 0,index,comments
0,1,Thank you very much Antonio ! All has been per...
1,1,Very nice appartment in the old town of Lissab...
2,1,When travelling we're looking for kids friendl...
3,1,We've been in Lisbon in march 2013 (3 adults a...
4,1,Our host Antonio was very helpful with informa...


**Everything looks fine**

In [8]:
print("train shape:", train.shape)
print("train_reviews shape:", train_reviews.shape)
print("test shape:", test.shape)
print("test_reviews shape:", test_reviews.shape)

train shape: (6248, 4)
train_reviews shape: (361281, 2)
test shape: (695, 3)
test_reviews shape: (41866, 2)


### 2.1 Index Fix

**In the dataset, the index column has repeating values, which likely indicate that each value corresponds to a specific house. To enhance clarity, we will rename this column to 'house ID'.**

In [18]:
# For train DataFrame
train['houseID'] = train.index  # Create 'houseID' column with index values
train.set_index('houseID', inplace=True)  # Set 'houseID' as the new index
train.drop(columns=['index'], inplace=True)  # Remove the column named 'index'

# For train_reviews DataFrame
train_reviews['houseID'] = train_reviews.index
train_reviews.set_index('houseID', inplace=True)
train_reviews.drop(columns=['index'], inplace=True)

# For test DataFrame
test['houseID'] = test.index
test.set_index('houseID', inplace=True)
test.drop(columns=['index'], inplace=True)

# For test_reviews DataFrame
test_reviews['houseID'] = test_reviews.index
test_reviews.set_index('houseID', inplace=True)
test_reviews.drop(columns=['index'], inplace=True)


KeyError: "['index'] not found in axis"

In [10]:
train.head(5)

Unnamed: 0,level_0,description,host_about,unlisted,houseID
0,0,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0,1
1,1,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1,2
2,2,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1,3
3,3,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0,4
4,4,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0,5


In [11]:
train_reviews.head(5)

Unnamed: 0,level_0,comments,houseID
0,0,this is a very cozy and comfortable house to s...,1
1,1,good<br/>,1
2,2,"My first hostel experience, and all I have to ...",1
3,3,Das Hostel war neu und deshalb funktionierte a...,1
4,4,"It was fine for a dorm, but I think for the pe...",1


In [12]:
test.head(5)

Unnamed: 0,level_0,description,host_about,houseID
0,0,<b>The space</b><br />Apartment located in the...,"Gosto de viajar, de conhecer pessoas, gosto de...",1
1,1,"IMPORTANT: In response to COVID-19, this prope...",We are the be@home Team!\n\nYou can count on u...,2
2,2,"Bright, beautiful, and spacious. This four-bed...","Hi there!\n\nWe're GuestReady, a professional ...",3
3,3,Charming Apartment Close to the Bay of Cascais...,:),4
4,4,"Se procura umas férias perto da praia, casino ...",Bem vindos a Portugal!_x000D_\nAdoro o meu Paí...,5


In [13]:
test_reviews.head(5)

Unnamed: 0,level_0,comments,houseID
0,0,Thank you very much Antonio ! All has been per...,1
1,1,Very nice appartment in the old town of Lissab...,1
2,2,When travelling we're looking for kids friendl...,1
3,3,We've been in Lisbon in march 2013 (3 adults a...,1
4,4,Our host Antonio was very helpful with informa...,1


### 2.2 Duplicates

**Cheacking for Duplicated Rows**

In [14]:
print("Number of duplicated rows in train DataFrame:", train.duplicated().sum())
print("Number of duplicated rows in train_reviews DataFrame:", train_reviews.duplicated().sum())
print("Number of duplicated rows in test DataFrame:", test.duplicated().sum())
print("Number of duplicated rows in test_reviews DataFrame:", test_reviews.duplicated().sum())

Number of duplicated rows in train DataFrame: 0
Number of duplicated rows in train_reviews DataFrame: 0
Number of duplicated rows in test DataFrame: 0
Number of duplicated rows in test_reviews DataFrame: 0


**Removing Duplicated Rows**

In [15]:
train.drop_duplicates(inplace=True)
train_reviews.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)
test_reviews.drop_duplicates(inplace=True)

print("Number of duplicated rows in train DataFrame after removal:", train.duplicated().sum())
print("Number of duplicated rows in train_reviews DataFrame after removal:", train_reviews.duplicated().sum())
print("Number of duplicated rows in test DataFrame after removal:", test.duplicated().sum())
print("Number of duplicated rows in test_reviews DataFrame after removal:", test_reviews.duplicated().sum())


Number of duplicated rows in train DataFrame after removal: 0
Number of duplicated rows in train_reviews DataFrame after removal: 0
Number of duplicated rows in test DataFrame after removal: 0
Number of duplicated rows in test_reviews DataFrame after removal: 0


### 2.3 Null Values

In [16]:
print("Number of null values in train DataFrame:")
print(train.isnull().sum())

print("\nNumber of null values in train_reviews DataFrame:")
print(train_reviews.isnull().sum())

print("\nNumber of null values in test DataFrame:")
print(test.isnull().sum())

print("\nNumber of null values in test_reviews DataFrame:")
print(test_reviews.isnull().sum())

Number of null values in train DataFrame:
level_0        0
description    0
host_about     0
unlisted       0
houseID        0
dtype: int64

Number of null values in train_reviews DataFrame:
level_0     0
comments    2
houseID     0
dtype: int64

Number of null values in test DataFrame:
level_0        0
description    0
host_about     0
houseID        0
dtype: int64

Number of null values in test_reviews DataFrame:
level_0     0
comments    0
houseID     0
dtype: int64


In [17]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
level_0,6248.0,3123.5,1803.786573,0.0,1561.75,3123.5,4685.25,6247.0
unlisted,6248.0,0.273367,0.445724,0.0,0.0,0.0,1.0,1.0
houseID,6248.0,3124.5,1803.786573,1.0,1562.75,3124.5,4686.25,6248.0
