# GOOGLE APP STORE

#  IMPORT NECESSARY LIBRARIES

In [69]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# READING THE DATA

In [70]:
# Load the raw Play Store export. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks.
apps_csv_path = 'Apps_data+(1).csv'
df = pd.read_csv(apps_csv_path, low_memory=False)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


# SHAPE AND INFO OF THE DATA

In [71]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(10841, 13)

In [72]:
# Per-column dtype and non-null count. In the recorded output only Rating is
# numeric; every other column loads as object and needs explicit conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


# SUMMARY STATISTICS OF DATASET

In [73]:
# Summary of the numeric columns (only Rating at this point).
# NOTE(review): the recorded max of 19.0 sits far above the 75th percentile
# (4.5) and looks like a malformed record rather than a real rating.
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [74]:
# Count / unique / most frequent value for the object (string) columns.
df.describe(include='object')

Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,10841,10841,10841,10840,10841,10840,10841,10841,10833,10838
unique,9660,34,6002,462,22,3,93,6,120,1378,2832,33
top,ROBLOX,FAMILY,0,Varies with device,"1,000,000+",Free,0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,596,1695,1579,10039,10040,8714,842,326,1459,2451


# DROPPING THE DUPLICATE RECORDS 

In [75]:
# Number of rows that are exact copies of an earlier row (all columns equal).
df.duplicated().sum()

483

In [76]:
# Keep the first occurrence of each fully duplicated row, then confirm that
# no duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

0

# UNIQUE CATEGORIES OF THE COLUMN 'CATEGORY'

In [77]:
# List the distinct Category labels to spot invalid values.
df['Category'].unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
       '1.9'], dtype=object)

# DROPPING INVALID CATEGORY '1.9'

In [78]:
# '1.9' is not a category label; keep every row whose Category differs from
# it, then re-list the labels to confirm it is gone.
category_to_drop = '1.9'
keep_mask = df['Category'].ne(category_to_drop)
df = df[keep_mask]
df['Category'].unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION'],
      dtype=object)

# CHECKING NULL VALUES IN 'RATING'

In [79]:
# How many apps have no Rating value.
df['Rating'].isnull().sum()

1465

In [80]:
# Rows without a Rating cannot be labelled, so discard them and verify that
# no missing ratings are left.
df = df.dropna(subset=['Rating'])
df['Rating'].isna().sum()

0

# CREATING RATING_CATEGORY COLUMN

In [81]:
# Binary label: ratings strictly above 3.5 are 'High', everything else 'Low'.
is_high_rating = df['Rating'] > 3.5
df['Rating_category'] = np.where(is_high_rating, 'High', 'Low')
df['Rating_category']

0        High
1        High
2        High
3        High
4        High
         ... 
10834    High
10836    High
10837    High
10839    High
10840    High
Name: Rating_category, Length: 8892, dtype: object

In [82]:
# Class balance of the label; the recorded output shows 'High' outnumbering
# 'Low' roughly 9 to 1.
df['Rating_category'].value_counts()

Rating_category
High    8012
Low      880
Name: count, dtype: int64

# DISTRIBUTION OF RATING_CATEGORY

In [None]:
# Bar chart of the number of apps in each Rating_category class.
sns.countplot(data=df,x='Rating_category')

<Axes: xlabel='Rating_category', ylabel='count'>

# CHANGING THE DATATYPE OF REVIEWS AND HANDLING OUTLIERS USING LOG TRANSFORMATION

In [None]:
# Reviews was read as object; parse it to numbers (raises if any value is
# not numeric) and report how many non-null values it holds.
reviews_numeric = pd.to_numeric(df['Reviews'])
df['Reviews'] = reviews_numeric
df['Reviews'].notna().sum()

In [None]:
# Box plot of the raw review counts to expose the spread and outliers.
ax = sns.boxplot(data=df, x='Reviews')
ax.set_title('Reviews_numeric')
plt.show()

In [None]:
# log1p computes log(1 + x), so apps with zero reviews map to 0 rather than
# -inf while the long right tail is compressed.
df['Reviews_log'] = np.log1p(df['Reviews'])

In [None]:
# Same box plot after the log transform, for comparison with the raw counts.
ax = sns.boxplot(data=df, x='Reviews_log')
ax.set_title('Reviews_log-Transformed')
plt.show()

# TREAT THE NON NUMERIC DATA AND CONVERT COLUMN INTO SUITABLE DTYPE

In [None]:
# Drop apps whose size is reported as the non-numeric placeholder
# 'Varies with device'. The .copy() makes the result an independent frame so
# that later column assignments on df do not write into a slice of the old
# frame (which triggers SettingWithCopyWarning).
df = df[df['Size'] != 'Varies with device'].copy()
def convert_num(values):
    """Convert size strings such as '19M' or '201k' to floats.

    A trailing 'M' is read as a factor of 1e6 and a trailing 'k' as 1e3;
    values without a suffix are parsed as plain numbers.

    Parameters
    ----------
    values : pandas.Series
        Size values, typically strings like '8.7M' or '201k'. Already-numeric
        values are accepted and returned unchanged in value.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``values``.

    Raises
    ------
    ValueError
        If any value cannot be parsed as a number after suffix handling.
    """
    text = values.astype(str).str.strip()
    # Rewrite only a *trailing* unit suffix into scientific notation; the
    # anchor keeps an 'M' or 'k' elsewhere in the string from being touched.
    text = text.str.replace(r'M$', 'e6', regex=True)
    text = text.str.replace(r'k$', 'e3', regex=True)
    # Vectorised parsing instead of evaluating each string as an expression.
    return pd.to_numeric(text, errors='raise').astype(float)
# Replace Size with its numeric form. The conversion is run once; the
# previous extra call computed the same result and discarded it.
df['Size'] = convert_num(df['Size'])
df['Size']

# REMOVING  THE UNWANTED CHARACTERS AND CHANGING THE DATATYPE

In [None]:
# Installs looks like '10,000+': strip the '+' and thousands separators,
# then store the result as an integer column.
installs_text = df['Installs'].astype(str).str.replace('[+,]', '', regex=True)
df['Installs'] = installs_text.astype(int)
df

In [None]:
# Inspect the raw Price strings and how often each occurs before cleaning.
df['Price'].value_counts()

In [None]:
# Remove only the literal currency symbol and convert to float. The decimal
# point must be kept: stripping it would turn a price like '4.99' into '499'.
# regex=False makes '$' a plain character rather than an end-of-string anchor.
price_text = df['Price'].astype(str).str.replace('$', '', regex=False)
df['Price'] = pd.to_numeric(price_text)
df['Price']

# DROPPING THE COLUMNS (REDUNDANT FOR THE ANALYSIS)

In [None]:
# Columns not used as model inputs. Rating is removed because the label
# Rating_category was derived from it.
unused_columns = ['Rating', 'App', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
df = df.drop(unused_columns, axis=1)
df

# ENCODE THE CATEGORICAL COLUMNS

In [None]:
# One-hot encode the three categorical columns in a single pass. Each dummy
# column is prefixed with its source column name, and drop_first removes one
# level per column.
categorical_columns = ['Category', 'Type', 'Content Rating']
df2 = pd.get_dummies(data=df.copy(), columns=categorical_columns, drop_first=True)
df2

# SEGREGATING THE DATASET

In [None]:
# Separate the label from the predictors.
label_column = 'Rating_category'
independent_feature = df2.drop(columns=[label_column])
target = df2[label_column]
target

In [None]:
# Display the predictor matrix (every column of df2 except Rating_category).
independent_feature

# SPLITTING THE DATASET

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the
# shuffle, and therefore every downstream result, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    independent_feature,
    target,
    test_size=0.3,
    random_state=42,
)

# STANDARDIZING THE DATASET

In [None]:
from sklearn.preprocessing import StandardScaler

num_columns=['Reviews','Size','Installs','Price']
std_sclr = StandardScaler()

# Scale the split data, not df2: the split has already been made, so scaling
# df2 would leave x_train / x_test untouched. The scaler is fit on the
# training rows only and then applied to the test rows, so no test-set
# statistics leak into the preprocessing.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[num_columns] = std_sclr.fit_transform(x_train[num_columns])
x_test[num_columns] = std_sclr.transform(x_test[num_columns])
x_train.head()

# HONEY PRODUCTION

# IMPORT NECESSARY LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and copy warnings from pandas,
# seaborn and matplotlib, which can mask real problems.
warnings.filterwarnings("ignore")

# READING DATASET

In [None]:
# Load the honey production table; low_memory=False makes pandas parse the
# file in a single pass instead of in chunks.
honey_csv_path = 'honeyproduction.csv'
df1 = pd.read_csv(honey_csv_path, low_memory=False)
df1.head()

In [None]:
# (rows, columns) of the honey production table as loaded.
df1.shape

In [None]:
# Per-column dtype and non-null count for the honey production table.
df1.info()

# PERCENTAGE DISTRIBUTION OF DATA IN EACH YEAR

In [None]:
# Number of records per year, most frequent first.
df1['year'].value_counts()

In [None]:
# Take the per-year record counts straight from the data instead of
# hand-copying them from the previous cell's output, so the chart stays
# correct if the dataset changes.
year_counts = df1['year'].value_counts()
year_values = year_counts.to_numpy()
years = year_counts.index.to_list()

plt.figure(figsize=(12,10))
plt.pie(year_values, labels=years, autopct='%1.1f%%', startangle=90, shadow=False)
plt.title('Percentage Distribution by Year')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

# DISTRIBUTION OF PRICE PER LB USING DISPLOT

In [None]:
# Histogram of priceperlb with a kernel density estimate overlaid.
sns.displot(data=df1,x="priceperlb",kde=True,color='blue')


The distribution of the variable "priceperlb" is right-skewed, meaning that most of the values are concentrated on the left side of the plot, with some outliers on the right.
 
Mean value of the variable is slightly higher  than the median , indicating that the distribution is not symmetric



# SCATTER PLOT (NUMCOL AND PRODVALUE)

In [None]:
# Scatter of production value against number of colonies, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df1['numcol'], df1['prodvalue'], c='blue', marker='o')
ax.set_title('Relationship between numcol and prodvalue')
ax.set_xlabel('numcol')
ax.set_ylabel('prodvalue')
plt.show()

The relationship between numcol and prodvalue is positive, meaning that as the number of honey-producing colonies increases, the production value also increases.

The relationship is also approximately linear, meaning that there is a roughly constant rate of change between the two variables.

The relationship is strong, because the points lie close to a line of best fit.


# BOXPLOT ( YEAR AND PRODVALUE)

In [None]:
# One box per year showing the distribution of prodvalue.
fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(data=df1, x='year', y='prodvalue', ax=ax)
plt.show()

The production value of honey varies over the years.

The relationship between  year and production value is non linear, meaning that the rate of change of production value is not constant across the years.

The year 2012 has fewer outliers than any of the other years.

# PAIRPLOT

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the selected numeric columns; the trailing ';' hides the grid's repr.
sns.pairplot(data= df1[['numcol','yieldpercol','totalprod','prodvalue','year']]);

The variable numcol (number of colonies) has a positive relationship with totalprod (total production) and prodvalue (production value), meaning that as the number of colonies increases, the total production and its value also increase.

The variable yieldpercol (yield per colony) has a negative relationship with numcol, meaning that as the number of colonies increases, the yield per colony decreases.

# HEATMAP

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric honey
# columns; the trailing ';' hides the Axes repr.
corr_columns = ['numcol', 'yieldpercol', 'totalprod', 'stocks', 'priceperlb', 'prodvalue']
corr_matrix = df1[corr_columns].corr()
sns.heatmap(data=corr_matrix, annot=True, cbar=False);

Total production (totalprod) and number of colonies (numcol) have a strong positive correlation, i.e. the correlation coefficient is close to 1.

priceperlb and yieldpercol have a strong negative correlation, i.e. the correlation coefficient is close to -1.


