In [None]:
## code for classifying apps based on category

#Required libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

cwd = os.getcwd()  # Current working directory (used only by the local-path variant below)

# Local alternative: df = pd.read_csv(cwd + '/All Categories.csv', index_col=0)
df = pd.read_csv(
    '/kaggle/input/google-play-store-category-wise-top-500-apps/All Categories.csv',
    index_col=0,
)  # Read csv, using the first column as the index

df.head()  # First 5 rows

data = df.copy()  # Work on a copy so the raw frame stays untouched

data.info()  # DataFrame information

# Keep only the rows that have a Reviews value. The explicit .copy() makes
# `data` an independent frame, so dropping or adding columns afterwards does
# not raise pandas' SettingWithCopyWarning.
data = data[data.Reviews.notna()].copy()

data.info()

# Drop the Size column (non-inplace: returns a new frame).
data = data.drop('Size', axis=1)

data.Reviews.unique()  # Unique values of the column Reviews

data.Downloads.unique()  # Unique values of the column Downloads

# Abbreviations

# T = 1000            (K)
# L = 100000
# Cr = 10000000       (10M)
# TCr = 10000000000   (10B)

# Rows whose Downloads value contains 'TCr' (author's note: YouTube is the only
# app with more than 10 billion downloads == 1TCr+).
data[data.Downloads.str.contains('TCr', regex=False)]

### Changing abbreviations of Reviews

##### Convert the current abbreviations to K (thousand) and M (million) for Reviews, and to K (thousand), M (million) and B (billion) for Downloads
#### For each of them, create two columns: one with the new abbreviations and one with the value in units.

Lista_num = []  # Numeric part of every Reviews value, still stored as str
Lista_str = []  # Abbreviation found at the end of every Reviews value ('' if none)

# The suffixes are tested in the same order as before: T, L, Cr.
for value in data.Reviews:
    for suffix in ('T', 'L', 'Cr'):
        if value.endswith(suffix):
            Lista_num.append(value[:-len(suffix)])
            Lista_str.append(suffix)
            break
    else:
        # No known abbreviation: the whole value is the number.
        Lista_num.append(value)
        Lista_str.append('')

Lista_str  # List of abbreviations


np.unique(np.array(Lista_str))  # Unique values of the list

# Converting abbreviations to units

# T = 1000     (K)
# L = 100000
# Cr = 10000000       (10M)
unit_by_abbreviation = {'T': 1000, 'L': 100000, 'Cr': 10000000}

# Anything that is not one of the three abbreviations (e.g. '') counts as 1.
Lista_str_to_num = [
    unit_by_abbreviation.get(abbreviation, 1) for abbreviation in Lista_str
]


Lista_str_to_num  # List of multipliers derived from the abbreviations

# Converting numbers saved as str to integers
integer_list = [int(number) for number in Lista_num]
print(integer_list[0:20])

# Element-wise product: numeric part times its multiplier.
Reviews_unit = np.array(Lista_str_to_num) * np.array(integer_list)
Reviews_unit      # Reviews in units

# Creating a list of Reviews with abbreviations (K (thousand), M (million))
Reviews = []

for v in Reviews_unit:
    if v >= 1000000:
        # Millions keep one decimal place.
        label = f'{round(v/1000000,1)}M'
    elif v >= 1000:
        # Thousands are truncated to a whole number.
        label = f'{int(v/1000)}K'
    else:
        label = f'{v}'
    Reviews.append(label)

Reviews        # Reviews with the new abbreviations

np.unique(np.array(Reviews))

Reviews_Abb = np.array(Reviews)  # Reviews with abbreviations saved as a numpy array

### Changing abbreviations of Downloads

# Same process of changing the abbreviations with the column of Downloads
data.Downloads.unique()

# Multiplier for every Downloads suffix.
# T = 1000            (K)
# L = 100000
# Cr = 10000000       (10M)
# TCr = 10000000000   10.000.000.000  (10B)
# The order matters: 'TCr+' must be tested before 'Cr+' (it also ends with
# 'Cr+'), and the bare '+' is only a last resort for values without any
# abbreviation. Previously every value that was not T+/L+/TCr+ was assumed to
# be 'Cr+', so a plain count such as '500+' would have been cut and scaled
# incorrectly.
download_units = {
    'TCr+': 10000000000,
    'Cr+': 10000000,
    'L+': 100000,
    'T+': 1000,
    '+': 1,
}

Lista_num_downloads = []  # Numeric part of every Downloads value, as str
Lista_str_downloads = []  # Suffix found at the end of every Downloads value

for value in data.Downloads:
    for suffix in download_units:
        if value.endswith(suffix):
            Lista_num_downloads.append(value[:-len(suffix)])
            Lista_str_downloads.append(suffix)
            break
    else:
        # No trailing '+' at all: the whole value is the number.
        Lista_num_downloads.append(value)
        Lista_str_downloads.append('')


np.unique(np.array(Lista_str_downloads))

# Unknown/empty suffixes count as plain units.
Lista_str_to_num_downloads = [
    download_units.get(suffix, 1) for suffix in Lista_str_downloads
]


np.unique(np.array(Lista_str_to_num_downloads))


int_output = map(int, Lista_num_downloads)  # Maps each string to an int.
int_list = list(int_output)  # Converts mapped output to a list of ints.
print(int_list[0:20])

Downloads_unit = np.array(Lista_str_to_num_downloads) * np.array(int_list)
Downloads_unit     # array of Downloads in units

Downloads = []

for v in Downloads_unit:
    if v >= 1000000000:
        Downloads.append(f'{int(v/1000000000)}B+')
    elif v >= 1000000:
        Downloads.append(f'{int(v/1000000)}M+')
    elif v >= 1000:
        Downloads.append(f'{int(v/1000)}K+')
    else:
        # Below one thousand the value is shown as is instead of '0K+'.
        Downloads.append(f'{v}+')

Downloads

np.unique(np.array(Downloads))  # Unique Download values


Downloads_Abb = np.array(Downloads)  # array of Downloads with the new abbreviations

### Adding the new columns

data.head()

# Review columns first, then Download columns (same order as before).
new_columns = {
    'Reviews_Abb': Reviews_Abb,
    'Reviews_unit': Reviews_unit,
    'Downloads_Abb': Downloads_Abb,
    'Downloads_unit': Downloads_unit,
}
for column_name, column_values in new_columns.items():
    data[column_name] = column_values

data.head()

### Creating new Dataframe with complete data

data.columns

# New order of columns
columns = ['Name', 'Developer', 'Category', 'Star Rating', 'Reviews_Abb', 'Reviews_unit', 'Downloads_Abb',
       'Downloads_unit','Rated for']

# Select the columns and rename the ones containing blank spaces in a single
# expression. rename() returns a new DataFrame, so nothing is modified in
# place on a selection of `data` (an in-place rename there raises pandas'
# SettingWithCopyWarning).
all_data = data[columns].rename(
    columns={'Star Rating': 'Star_Rating', 'Rated for': 'Rated_for'}
)

all_data.head()

## Visualizations

### Rating Distribution

# Histogram of Star_Rating with a vertical line at the mean rating.
plt.style.use('bmh')
plt.figure(figsize=(14,6))

media_rating =all_data.Star_Rating.mean() # Mean of the Star_Rating column

# Vertical line at the mean; the label is what plt.legend() shows below.
plt.axvline(x = media_rating , color="b", ls = '-',label="Average Rating")

plt.hist(all_data.Star_Rating, color='g')

plt.xticks(rotation=45)

plt.title("Star Rating Distribution")
plt.xlabel("Rating")
plt.ylabel("Absolute frequency")

plt.legend(loc = "upper left")


plt.show()

### Top 10 Categories by number of apps in the ranking

# First 10 entries of the per-category app counts (value_counts() lists the
# most frequent categories first).
category_count = all_data.Category.value_counts()[:10]
category_count

# Horizontal bar chart: one bar per category, bar length = number of apps.
plt.style.use('ggplot')
plt.figure(figsize=(14,6))

plt.barh(category_count.index,category_count , color='g')
plt.title("Top 10 Categories by number of apps in the ranking",{'fontsize':20},pad=20)
plt.xlabel("Number of Apps")
# Flip the y axis so the first category ends up at the top of the chart.
plt.gca().invert_yaxis()

plt.show()

### Correlation between Star Rating and Number of Downloads?

# Scatter of rating against downloads for every app.
fig = px.scatter(all_data, x="Star_Rating", y="Downloads_unit")
fig.show()

# Same scatter restricted to apps below 5,000,000,000 downloads (outliers dropped).
below_outlier_threshold = all_data.Downloads_unit < 5000000000
df_no_outliers = all_data[below_outlier_threshold]
fig = px.scatter(df_no_outliers, x="Star_Rating", y="Downloads_unit")
fig.show()

# Author's conclusion: there is no correlation (coefficient ~ 0).
# The coefficient is printed twice: first computed with NumPy, then with pandas.
correlation_matrix = np.corrcoef(all_data.Star_Rating, all_data.Downloads_unit)
print(correlation_matrix[0, 1])
pandas_correlation = all_data.Star_Rating.corr(all_data.Downloads_unit)
print(pandas_correlation)

### Apps with most downloads 

# TOP 20 Apps with most downloads.
# np.r_[0:2,3,6] selects column positions 0, 1, 3 and 6, i.e. Name, Developer,
# Star_Rating and Downloads_Abb in the column order all_data was built with.
all_data.sort_values('Downloads_unit',ascending=False).iloc[0:20,np.r_[0:2,3,6]]

### Top 10 Developers by number of Apps in Ranking

apps_by_devs = all_data.Developer.value_counts()[:10]  # Number of apps in the ranking by Developers
apps_by_devs

apps_by_devs.index

# Horizontal bars: developer names on the y axis, app counts on the x axis,
# with the count also written on each bar.
fig = px.bar(apps_by_devs, y=apps_by_devs.index , x=apps_by_devs, text=apps_by_devs, orientation='h')

fig.update_traces(textfont_size=12, marker_color='green',textangle=0, textposition="inside", cliponaxis=False)

fig.update_layout(
    title={
        'text':"Top 10 Developers by number of Apps in Ranking",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'} ,
    yaxis_title = None,
    xaxis_title="Number of Apps",
    # Order the bars by their total value.
    yaxis={'categoryorder':'total ascending'},
    font=dict(
        family="Arial, monospace",
        size=14,
        color="Green",
    )
)

fig.show()

### Top 10 Developers by number of Downloads

# Developers with the most Downloads.
# Only Downloads_unit is aggregated. Summing every column of each group and
# picking one afterwards also "adds up" the text columns, which is wasted
# work and can fail on non-numeric data.
Downloads_devs_10 = (
    all_data.groupby('Developer')['Downloads_unit']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
Downloads_devs_10

sns.set_theme(style="darkgrid")

fig, ax = plt.subplots(figsize=(14,6))

# One horizontal bar per developer; bar length = summed downloads.
ax = sns.barplot(x=Downloads_devs_10, y=Downloads_devs_10.index, ax=ax, color='g', errwidth=0)

ax.set_title('Top 10 Developers by number of Downloads', {'fontsize':20},pad=20)
ax.set_xlabel("Downloads (Tens of Billions)",{'fontsize':14})
ax.set(ylabel=None)

plt.show()


In [2]:
## code for filtering classification cases without a match 

import pandas as pd
import json
import re

categories_df = pd.read_csv("drive/MyDrive/All Categories.csv")
victor_df = pd.read_csv("drive/MyDrive/Sample - Activities - Hoja 1.csv")

# from google.colab import drive
# drive.mount('/content/drive')

print(categories_df.columns)

# Only the app name and its category are needed for the lookup below.
categories_df = categories_df.loc[:, ['Name', 'Category']]
categories_df.head()

print(victor_df.columns)

# Keep the 'FocusIn' events and, of those, only the event name and parameters.
is_focus_in = victor_df['EventName'] == 'FocusIn'
activity_switching_df = victor_df.loc[is_focus_in, ['EventName', 'Parameters']]
print(activity_switching_df)

for item in activity_switching_df['Parameters']:
    # The surrounding square brackets are removed before parsing the JSON text.
    json_params = json.loads(item.strip("[]"))
    print(json_params)
    app_name = json_params["AppName"]
    # Rows of the category table whose Name equals this app name (may be empty).
    name_matches = categories_df['Name'] == app_name
    cat_item = categories_df[name_matches]
    print(cat_item)

print(categories_df.columns)

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/All Categories.csv'

In [None]:
## code for matching apps to pre-labeled categories 

# Text Classification

## 0 - Setup

### Install libraries

!pip install transformers datasets gdown

### Import libraries

import re
import torch
import gdown
import transformers
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline

## 1 - Example using an existing sentiment analysis dataset

### Training the model

# Load the IMDB dataset
imdb_dataset = load_dataset("imdb")

# Define the model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Tokenize the dataset
def tokenize(batch):
    """Tokenize the "text" field of a batch with padding and truncation (max_length=512)."""
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


# The batch size equals the size of the train split, so that split is
# tokenized in a single call.
imdb_dataset = imdb_dataset.map(tokenize, batched=True, batch_size=len(imdb_dataset["train"]))
imdb_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Split ONCE, with a fixed seed. Calling train_test_split() separately for the
# train and the test part shuffles independently each time, so most "test"
# rows were also "train" rows and the evaluation was done on seen data.
splits = imdb_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = splits['train']
test_dataset = splits['test']
# As before, the validation rows are half of the held-out rows.
val_dataset = test_dataset.train_test_split(test_size=0.5, seed=42)['test']

# Hyper-parameters for the run, collected first and unpacked below.
training_options = {
    "output_dir": ".",
    "num_train_epochs": 3,              # total number of training epochs
    "per_device_train_batch_size": 16,  # batch size per device during training
    "per_device_eval_batch_size": 64,   # batch size for evaluation
    "warmup_steps": 500,                # number of warmup steps for learning rate scheduler
    "weight_decay": 0.01,               # strength of weight decay
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_options)

# The trainer receives the model, the arguments above, and the train and
# validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train, then write the resulting model to the "imdb_classification" directory.
trainer.train()
trainer.save_model("imdb_classification")

### Evaluating the model

from sklearn.metrics import classification_report
# Tokenize a sample sentence (the encoded tensors are not used by the report below)
input_text = "This movie was fantastic! I really enjoyed it."
inputs = tokenizer(input_text, return_tensors="pt")

# Predict on the held-out rows; the predicted class is the highest-scoring column.
result = trainer.predict(test_dataset)
labels = np.argmax(result.predictions, axis=1)
report = classification_report(
    test_dataset['label'],
    labels,
    target_names=['negative', 'positive'],
)
print(report)

## 2 - Website Example

### Obtaining the dataset

url = "https://drive.google.com/uc?id=12c0cBC3U2kR976EZrlas4DJiOAC-z58u"
filename = "website_categories.csv"
gdown.download(url, filename, quiet=False)

# Load the downloaded CSV file into a pandas DataFrame
df = pd.read_csv(filename)

# Split the data into training and testing sets (80% / 20%, fixed seed)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Each set goes to its own CSV file, named after the downloaded file
train_file = f'train_{filename}'
test_file = f'test_{filename}'
for frame, path in ((train, train_file), (test, test_file)):
    frame.to_csv(path, index=False)

### Data Cleaning

def extract_words_from_url(url):
    """Reduce a URL to its word tokens, joined by single spaces.

    The substitutions run in order: the scheme and 'www.' are removed, every
    dot followed by letters is removed (this covers the domain extension but
    also any other '.letters' run), and '-', '_' and '/' become spaces. The
    remaining word characters are then collected and joined.
    """
    substitutions = (
        (r'https?://', ''),     # Remove 'http://' or 'https://'
        (r'www\.', ''),         # Remove 'www.'
        (r'\.[a-zA-Z]+', ''),   # Remove domain extension
        (r'[-_/]', ' '),        # Replace '-', '_', and '/' with spaces
    )
    for pattern, replacement in substitutions:
        url = re.sub(pattern, replacement, url)
    tokens = re.findall(r'\b\w+\b', url)  # Extract words
    return ' '.join(tokens).strip()

def add_fields(df_tmp, le):
    """Add the 'text' and 'label' columns to df_tmp in place and return it.

    'text' is the words extracted from 'website_url', a space, and the
    'cleaned_website_text' column. 'label' is 'Category' transformed by le.
    """
    url_words = df_tmp['website_url'].apply(extract_words_from_url)
    df_tmp['text'] = url_words + ' ' + df_tmp['cleaned_website_text']
    df_tmp['label'] = le.transform(df_tmp['Category'])
    return df_tmp

# Fit the encoder on the categories of the full DataFrame.
le = LabelEncoder()
le.fit(df['Category'])
classes = le.classes_
num_categories = len(classes)

# Re-read both CSV files, add the 'text'/'label' columns, and write them back.
df_train = add_fields(pd.read_csv(train_file), le)
df_test = add_fields(pd.read_csv(test_file), le)
for frame, path in ((df_train, train_file), (df_test, test_file)):
    frame.to_csv(path, index=False)

### Training the model

# Load the dataset from CSV files
dataset = load_dataset("csv", data_files={"train": train_file, "test": test_file})

# Define the model and tokenizer
# model_name = "microsoft/deberta-base"
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_categories)


# Tokenize the dataset
def tokenize(batch):
    """Tokenize the "text" field of a batch with padding and truncation (max_length=512)."""
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


# The batch size equals the size of the train split, so that split is
# tokenized in a single call.
dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset["train"]))
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Split ONCE, with a fixed seed. Calling train_test_split() separately for the
# train and the test part shuffles independently each time, so most "test"
# rows were also "train" rows and the evaluation was done on seen data.
# NOTE(review): dataset['test'] (built from test_file) is not used here; the
# evaluation rows are carved out of dataset['train'].
splits = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = splits['train']
test_dataset = splits['test']
# As before, the validation rows are half of the held-out rows.
val_dataset = test_dataset.train_test_split(test_size=0.5, seed=42)['test']

# Hyper-parameters for the run, collected first and unpacked below.
training_options = {
    "output_dir": ".",
    "num_train_epochs": 3,              # total number of training epochs
    "per_device_train_batch_size": 16,  # batch size per device during training
    "per_device_eval_batch_size": 64,   # batch size for evaluation
    "warmup_steps": 500,                # number of warmup steps for learning rate scheduler
    "weight_decay": 0.01,               # strength of weight decay
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_options)

# The trainer receives the model, the arguments above, and the train and
# validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train, then write the resulting model to the "website_classification" directory.
trainer.train()
trainer.save_model("website_classification")

### Evaluating the model

Evaluate the entire model

# Predict on the held-out rows; the predicted class is the highest-scoring column.
result = trainer.predict(test_dataset)
labels = np.argmax(result.predictions, axis=1)
# Per-category report, labelled with the encoder's class names.
report = classification_report(test_dataset['label'], labels, target_names=classes)
print(report)

Evaluate a single string

# Load the model from the directory trainer.save_model("website_classification")
# wrote to. The path is relative, like the one used for saving, instead of the
# absolute '/content/...' location that only exists when the working directory
# is /content.
model = AutoModelForSequenceClassification.from_pretrained('./website_classification')
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)
# Print the class names next to the prediction for reference.
print(classes)
pipe("youtube video learn")

## 3 - Applications classification

### 3.1 - Obtaining the dataset

url = "https://drive.google.com/uc?id=16gad7p-qxRxEoo_6r80oRYQOr3bFioQ8"
filename = "apps_categories.csv"
gdown.download(url, filename, quiet=False)

# Load the downloaded CSV file into a pandas DataFrame
df = pd.read_csv(filename)

# Split the data into training and testing sets (80% / 20%, fixed seed)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Each set goes to its own CSV file, named after the downloaded file
train_file = f'train_{filename}'
test_file = f'test_{filename}'
for frame, path in ((train, train_file), (test, test_file)):
    frame.to_csv(path, index=False)

### 3.2 - Transforming the data

category_column_name = 'Category'

# Fit the encoder on the categories of the full DataFrame.
le = LabelEncoder()
le.fit(df[category_column_name])


def add_fields(df_tmp, le):
    """Add the 'text' and 'label' columns to df_tmp in place and return it.

    'text' is a copy of the 'Name' column; 'label' is the category column
    transformed by le.
    """
    # Extend this with more columns if needed: 'text' is meant to be one long
    # text separated by spaces.
    df_tmp['text'] = df_tmp['Name']
    encoded_labels = le.transform(df_tmp[category_column_name])
    df_tmp['label'] = encoded_labels
    return df_tmp


classes = le.classes_
num_categories = len(classes)

# Re-read both CSV files, add the 'text'/'label' columns, and write them back.
df_train = add_fields(pd.read_csv(train_file), le)
df_test = add_fields(pd.read_csv(test_file), le)
for frame, path in ((df_train, train_file), (df_test, test_file)):
    frame.to_csv(path, index=False)

### 3.3 - Training the model

# Load the dataset from CSV files
dataset = load_dataset("csv", data_files={"train": train_file, "test": test_file})

# Define the model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_categories)


# Tokenize the dataset
def tokenize(batch):
    """Tokenize the "text" field of a batch with padding and truncation (max_length=512)."""
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


# The batch size equals the size of the train split, so that split is
# tokenized in a single call.
dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset["train"]))
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Split ONCE, with a fixed seed. Calling train_test_split() separately for the
# train and the test part shuffles independently each time, so most "test"
# rows were also "train" rows and the evaluation was done on seen data.
# NOTE(review): dataset['test'] (built from test_file) is not used here; the
# evaluation rows are carved out of dataset['train'].
splits = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = splits['train']
test_dataset = splits['test']
# As before, the validation rows are half of the held-out rows.
val_dataset = test_dataset.train_test_split(test_size=0.5, seed=42)['test']

# Hyper-parameters for the run, collected first and unpacked below.
training_options = {
    "output_dir": ".",
    "num_train_epochs": 3,              # total number of training epochs
    "per_device_train_batch_size": 16,  # batch size per device during training
    "per_device_eval_batch_size": 64,   # batch size for evaluation
    "warmup_steps": 500,                # number of warmup steps for learning rate scheduler
    "weight_decay": 0.01,               # strength of weight decay
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_options)

# The trainer receives the model, the arguments above, and the train and
# validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train, then write the resulting model to the "apps_classification" directory.
trainer.train()
trainer.save_model("apps_classification")

### 3.4 - Evaluating the model

Evaluate the entire model

# Perform classification
result = trainer.predict(test_dataset)
labels = np.argmax(result.predictions, axis=1)
# Report by category name, like the website report. `labels=` lists every
# encoded class so the names line up even when a category has no rows in this
# split; zero_division=0 reports 0 for such categories.
print(classification_report(
    test_dataset['label'],
    labels,
    labels=np.arange(num_categories),
    target_names=classes,
    zero_division=0,
))

Evaluate a single string

# Load the model from the directory trainer.save_model("apps_classification")
# wrote to. The path is relative, like the one used for saving, instead of the
# absolute '/content/...' location that only exists when the working directory
# is /content.
model = AutoModelForSequenceClassification.from_pretrained('./apps_classification')
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)
# Print the class names next to the prediction for reference.
print(classes)
pipe("Microsoft Office Word")