In [1]:
# importing necessary libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from wordcloud import WordCloud 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#importing the dataset
amazon = pd.read_csv("E:/A/P/DS - Assignment Part 2 data set/amz_com-ecommerce_sample.csv", encoding="unicode_escape")
flipkart = pd.read_csv("E:/A/P/DS - Assignment Part 2 data set/flipkart_com-ecommerce_sample.csv", encoding="unicode_escape")

### Data Processing and Preparation 

In [3]:
#extracting necessary features in a new dataframe.

amz = amazon[['product_name','retail_price','discounted_price']]
flk = flipkart[['product_name','retail_price','discounted_price']]

# concating the title and description of the product as 'text'
amz["text"]=amazon['product_name']+amazon['description']

#dropping missing values present in text field
amz = amz.dropna(subset=['text'])

#resetting the indices
amz.reset_index(drop=True, inplace=True)

# performing teh same procedure for flipkart datframe as well
flk["text"]=flipkart['product_name']+flipkart['description']
flk = flk.dropna(subset=['text'])
flk.reset_index(drop=True, inplace=True)

In [4]:
sw=nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [5]:
#remove punctation marks from the text
def remove_punc(data):
    pattern = r'[' + string.punctuation + ']'
    data['text']=data['text'].map(lambda m:re.sub(pattern," ",str(m)))
    return data

# converting all characters into lower case
def lower(data):
    data['text1']=copdataydata['text'].map(lambda m:m.lower())
    return data

# tokenizing the text
def tokenization(text):
    tokens = re.split(' ',text)
    return tokens

def token(data):
    data['text']= data['text'].apply(lambda x: tokenization(x))
    return data

# removing stopwords
def remove_SW(data):
    data['text']=data['text'].apply(lambda x: [item for item in x if item not in sw])
    return data

# removing digits 
def remove_digits(data):
    data['text']=data['text'].apply(lambda x: [item for item in x if not item.isdigit()])
    return data

#lemmatization
def lemmatize(data):
    data['text']=data['text'].apply(lambda x: [lemmatizer.lemmatize(item) for item in x])
    return data

# removing empty tokens
def remove_empty_tokens(data):
    data['text']=data['text'].apply(lambda x: [item for item in x if item !=''])
    return data

# removing single characters
def remove_single_letters(data):
    data['text']=data['text'].apply(lambda x: [item for item in x if len(item) > 1])
    return data

# detokinizing
def detoken(data):
    data['text']= data['text'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))
    return data

#replacing empty spaces 
def replace_spaces(x,space,second):
    result = x.replace(space, second)
    return result

#removing spaces
def remove_space(data):
    data['text']= data['text'].apply(lambda x: replace_spaces(x,'  ',' '))
    return data

In [6]:
# pipeline function to process the data.
new_amz = amz.pipe(remove_punc).pipe(token).pipe(remove_SW).pipe(remove_digits).pipe(lemmatize).pipe(remove_empty_tokens).pipe(remove_single_letters).pipe(detoken).pipe(remove_space)
new_flk = flk.pipe(remove_punc).pipe(token).pipe(remove_SW).pipe(remove_digits).pipe(lemmatize).pipe(remove_empty_tokens).pipe(remove_single_letters).pipe(detoken).pipe(remove_space)

In [7]:
new_amz.head()

Unnamed: 0,product_name,retail_price,discounted_price,text
0,Alisha Solid Women's Cycling Shorts,982,438,Alisha Solid Women Cycling ShortsKey Features ...
1,FabHomeDecor Fabric Double Sofa Bed,32143,29121,FabHomeDecor Fabric Double Sofa BedFabHomeDeco...
2,AW Bellies,991,551,AW BelliesKey Features AW Bellies Sandals Wedg...
3,Alisha Solid Women's Cycling Shorts,694,325,Alisha Solid Women Cycling ShortsKey Features ...
4,Sicons All Purpose Arnica Dog Shampoo,208,258,Sicons All Purpose Arnica Dog ShampooSpecifica...


In [8]:
#exporting the cleaned dataframe
new_amz.to_csv("E:/A/P/internships/shack lab/DS - Assignment Part 2 data set/cleaned_amz.csv")
new_flk.to_csv("E:/A/P/internships/shack lab/DS - Assignment Part 2 data set/cleaned_flk.csv")

In [9]:
# dimenaion of the cleaned dataframe
print("Dimension of the cleaned Amazon dataset: " + str(new_amz['text'].shape))
print("Dimension of the cleaned Flipkart dataset: " + str(new_flk['text'].shape))

Dimension of the cleaned Amazon dataset: (19998,)
Dimension of the cleaned Flipkart dataset: (19998,)


In [10]:
#vectorizing the text using Tf-Idf Method

amz_X = new_amz['text']
flk_X = new_flk['text']

tfidf = TfidfVectorizer()

tfidf_amz = tfidf.fit_transform(amz_X)
tfidf_flk = tfidf.transform(flk_X)

In [11]:
#viewing the vectorized text as dataframe
pd.DataFrame(tfidf_flk.toarray(), columns= [tfidf.get_feature_names()]).head()

Unnamed: 0,000,0008m,000hrs,001,0010m,0011m,0018m,001flipkart,001pink,003,...,½to,½with,½ï,âº,âºc,â¼,â½,â¾,ãº,å¾ã
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
pd.DataFrame(tfidf_amz.toarray(), columns= [tfidf.get_feature_names()]).head()

Unnamed: 0,000,0008m,000hrs,001,0010m,0011m,0018m,001flipkart,001pink,003,...,½to,½with,½ï,âº,âºc,â¼,â½,â¾,ãº,å¾ã
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



#### I am taking a sample of 1000 from the dataframe, because it is computationally expensive to check each and every entry within themselves when the size of the dataset is large i.e., 2000. 

In [13]:
#getting the indices of the text with similarity score more than 0.95.
# the indices are saved in a list format as [i,j] where i is the index of the amazon product and j is of flipkart product

index=[]
for i in range(1001):
    for j in range(1001):
        matches = cosine_similarity(tfidf_amz[i], tfidf_flk[j])
        if matches == 1 or matches >0.95:
            index.append([i,j])

In [14]:
#viewing the first 10 elements of the list "index"
index[:10]

[[0, 0],
 [0, 6],
 [0, 13],
 [0, 15],
 [1, 1],
 [1, 7],
 [1, 16],
 [1, 19],
 [2, 2],
 [3, 3]]

In [15]:
len(index)

2635

In [16]:
# saving the product name, retail price and discount price of similar products in a dataframe.
result=pd.DataFrame()

for i,j in index:
    a=pd.concat([new_amz[['product_name', "retail_price", "discounted_price"]].iloc[[i]],new_flk[['product_name', "retail_price", "discounted_price"]].iloc[[i]]], axis=1)
    result = pd.concat([result,a])

In [17]:
result.head()

Unnamed: 0,product_name,retail_price,discounted_price,product_name.1,retail_price.1,discounted_price.1
0,Alisha Solid Women's Cycling Shorts,982,438,Alisha Solid Women's Cycling Shorts,999.0,379.0
0,Alisha Solid Women's Cycling Shorts,982,438,Alisha Solid Women's Cycling Shorts,999.0,379.0
0,Alisha Solid Women's Cycling Shorts,982,438,Alisha Solid Women's Cycling Shorts,999.0,379.0
0,Alisha Solid Women's Cycling Shorts,982,438,Alisha Solid Women's Cycling Shorts,999.0,379.0
1,FabHomeDecor Fabric Double Sofa Bed,32143,29121,FabHomeDecor Fabric Double Sofa Bed,32157.0,22646.0


In [18]:
result.shape

(2635, 6)