In [1]:
import pandas as pd

In [2]:
# Load the raw gallery data; the file name is a relative path, so it is
# resolved against the notebook's current working directory.
Gallery = pd.read_csv("AmericanGallery.csv")

In [3]:
# List the raw column names before renaming them.
Gallery.columns

Index(['Title', 'Artist', 'Nationality', 'BeginDate', 'EndDate', 'Gender',
       'Date', 'Department'],
      dtype='object')

In [4]:
# The raw column names are ambiguous, so rename them for better understanding.
# One rename call with a column mapping returns a new frame, which keeps the
# cell free of in-place mutation.
Gallery = Gallery.rename(columns={
    'Title': 'Art_Title',
    'BeginDate': 'Birth',
    'EndDate': 'Death',
    'Date': 'Submission_Date',
})
Gallery

Unnamed: 0,Art_Title,Artist,Nationality,Birth,Death,Gender,Submission_Date,Department
0,Dress MacLeod from Tartan Sets,Sarah Charlesworth,(American),-1947.0,-2013.0,(Female),1986,Prints & Illustrated Books
1,Duplicate of plate from folio 11 verso (supple...,Pablo Palazuelo,(Spanish),-1916.0,-2007.0,(Male),1978,Prints & Illustrated Books
2,Tailpiece (page 55) from SAGESSE,Maurice Denis,(French),-1870.0,-1943.0,(Male),1889-1911,Prints & Illustrated Books
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Aristide Maillol,(French),-1861.0,-1944.0,(Male),1927-1940,Prints & Illustrated Books
4,97 rue du Bac,Eugène Atget,(French),-1857.0,-1927.0,(Male),1903,Photography
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Henry Moore,(British),-1898.0,-1986.0,(Male),1968-1969,Painting & Sculpture
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Juan Baixas,(Chilean),-1942.0,,(Male),1975,Architecture & Design
16726,The Catboat,Edward Hopper,(American),-1882.0,-1967.0,(Male),1922,Prints & Illustrated Books
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Unknown,(),,,(),1931,Prints & Illustrated Books


In [5]:
# Rearranging the column sequence so that the information is easier to process:
# artwork fields first, followed by the artist fields.
column_order = [
    "Art_Title", "Department", "Submission_Date",
    "Artist", "Nationality", "Gender", "Birth", "Death",
]
Gallery = Gallery[column_order]
Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,(American),(Female),-1947.0,-2013.0
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,(Spanish),(Male),-1916.0,-2007.0
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1889-1911,Maurice Denis,(French),(Male),-1870.0,-1943.0
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1927-1940,Aristide Maillol,(French),(Male),-1861.0,-1944.0
4,97 rue du Bac,Photography,1903,Eugène Atget,(French),(Male),-1857.0,-1927.0
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968-1969,Henry Moore,(British),(Male),-1898.0,-1986.0
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,(Chilean),(Male),-1942.0,
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,(American),(Male),-1882.0,-1967.0
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,(),(),,


In [6]:
# Check the column dtypes before cleaning.
Gallery.dtypes

Art_Title           object
Department          object
Submission_Date     object
Artist              object
Nationality         object
Gender              object
Birth              float64
Death              float64
dtype: object

In [7]:
# Removing unnecessary characters in the following columns.
# Nationality and Gender are wrapped in parentheses, e.g. "(American)"; an
# empty "()" becomes an empty string. regex=False makes the literal match
# explicit, since "(" and ")" are regex metacharacters and the default of the
# `regex` argument differs between pandas versions.
for text_column in ("Nationality", "Gender"):
    Gallery[text_column] = (
        Gallery[text_column]
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

# Birth and Death are stored as negative floats (e.g. -1947.0). Taking the
# absolute value drops the sign directly and leaves missing values as NaN.
for year_column in ("Birth", "Death"):
    Gallery[year_column] = Gallery[year_column].abs()

Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,American,Female,1947.0,2013.0
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,Spanish,Male,1916.0,2007.0
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1889-1911,Maurice Denis,French,Male,1870.0,1943.0
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1927-1940,Aristide Maillol,French,Male,1861.0,1944.0
4,97 rue du Bac,Photography,1903,Eugène Atget,French,Male,1857.0,1927.0
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968-1969,Henry Moore,British,Male,1898.0,1986.0
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,Chilean,Male,1942.0,
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,American,Male,1882.0,1967.0
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,,,,


In [8]:
# Look at the distinct date strings to see which formats need cleaning.
Gallery['Submission_Date'].unique()

array(['1986', '1978', '1889-1911', ..., '1936-1938', '(c. 1955-1959)',
       'c. 1919-1920'], dtype=object)

In [9]:
# Characters to delete from the date strings, e.g. the brackets, dot, space
# and "c" in values such as "(c. 1955-1959)".
bad_char = ['(', ')', '.', ' ', 'C', 'c', 'S', 's', "'", '|']

# str.maketrans(from, to, delete): the first two arguments are empty, so no
# character is mapped to another one; the third argument is the string of
# characters that translate() removes.
translation_table = str.maketrans('', '', ''.join(bad_char))


def _remove_bad_chars(date_text):
    """Return date_text with every character listed in bad_char removed."""
    return date_text.translate(translation_table)


# apply() calls the helper on each element of the 'Submission_Date' column.
Gallery['Submission_Date'] = Gallery['Submission_Date'].apply(_remove_bad_chars)
Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,American,Female,1947.0,2013.0
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,Spanish,Male,1916.0,2007.0
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1889-1911,Maurice Denis,French,Male,1870.0,1943.0
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1927-1940,Aristide Maillol,French,Male,1861.0,1944.0
4,97 rue du Bac,Photography,1903,Eugène Atget,French,Male,1857.0,1927.0
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968-1969,Henry Moore,British,Male,1898.0,1986.0
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,Chilean,Male,1942.0,
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,American,Male,1882.0,1967.0
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,,,,


In [10]:
# Remove any hyphen(s) at the start of a date string. lstrip('-') returns
# strings that do not start with '-' unchanged, so no separate check is needed.
Gallery['Submission_Date'] = Gallery['Submission_Date'].apply(
    lambda date_text: date_text.lstrip('-')
)

In [11]:
## Filter the rows that need correction:
## ranges (containing '-') that are shorter than the nine characters of a
## full "YYYY-YYYY" value, such as "1910-30".
has_hyphen = Gallery['Submission_Date'].str.contains('-')
is_abbreviated = Gallery['Submission_Date'].str.len() < 9
inaccurate_date = Gallery['Submission_Date'][has_hyphen & is_abbreviated]
inaccurate_date

6021     1910-30
12495    1910-30
14187    1880-90
16003    1910-30
Name: Submission_Date, dtype: object

In [12]:
# Inspect the rows around position 6021, one of the abbreviated ranges listed
# above, before correcting it.
Gallery.iloc[6020:6030,0:8]

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
6020,"Bombed Regency Staircase, Upper Brook Street, ...",Photography,1942,Bill Brandt,British,Male,1904.0,1983.0
6021,15 postcards of Tiflis (c.1900-30) and one of ...,Prints & Illustrated Books,1910-30,Unknown,,,,
6022,Untitled from Kartinki-Voina russkikh s nemtsa...,Prints & Illustrated Books,1914,Unknown,,,,
6023,Galloping Horse with Jockey,Photography,1884-1886,Eadweard J. Muybridge,American,Male,1830.0,1904.0
6024,"""The Warsaw Ghetto""",Photography,1943,Associated Press,,,,
6025,M. Zhuk (Ukrainian painting series),Prints & Illustrated Books,1930,Anatol' Petryts'kyi,Ukrainian,Male,1895.0,1964.0
6026,Pictorial initial U (page 33) from REVENAR,Prints & Illustrated Books,1936,Max Jiménez,Costa Rican,Male,1908.0,1949.0
6027,Snihy. Poezii,Prints & Illustrated Books,1925,Evgen Mei,,Male,,
6028,Plate (page 9) from THE PASTORAL LOVES OF DAPH...,Prints & Illustrated Books,1934,Ruth Reeves,American,Female,1892.0,1966.0
6029,"Sher-e-Bangla Nagar, Capital of Bangladesh, Dh...",Architecture & Design,1962-1983,Louis I. Kahn,American,Male,1901.0,1974.0


In [13]:
#Function to correct the inaccurate dates
def accurate_date(x):
    """Expand an abbreviated year range such as "1910-30" to "1910-1930".

    Parameters
    ----------
    x : str
        A "start-end" range containing exactly one hyphen, where the end
        year may be written with fewer digits than the start year.

    Returns
    -------
    str
        The range with the end year written out in full. A range whose end
        year is already as long as the start year is returned unchanged.

    Raises
    ------
    ValueError
        If ``x`` does not split into exactly two parts on '-'.
    """
    start_year, end_year = x.split('-')
    # Number of leading digits the end year is missing relative to the start.
    missing = len(start_year) - len(end_year)
    if missing > 0:
        short_length = len(end_year)
        end_year = start_year[:missing] + end_year
        # A range that crosses a decade/century boundary (e.g. "1995-05")
        # would otherwise end before it starts; move it to the next period.
        if (start_year.isdigit() and end_year.isdigit()
                and int(end_year) < int(start_year)):
            end_year = str(int(end_year) + 10 ** short_length)
    return f"{start_year}-{end_year}"

In [14]:
# Apply the accurate_date function to each element in the Series and write the
# expanded ranges back to the same rows, selected by their index labels.
Gallery.loc[inaccurate_date.index, 'Submission_Date'] = inaccurate_date.apply(accurate_date)

In [15]:
# Confirm that the abbreviated range at position 6021 is now written in full.
Gallery.iloc[6020:6025]

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
6020,"Bombed Regency Staircase, Upper Brook Street, ...",Photography,1942,Bill Brandt,British,Male,1904.0,1983.0
6021,15 postcards of Tiflis (c.1900-30) and one of ...,Prints & Illustrated Books,1910-1930,Unknown,,,,
6022,Untitled from Kartinki-Voina russkikh s nemtsa...,Prints & Illustrated Books,1914,Unknown,,,,
6023,Galloping Horse with Jockey,Photography,1884-1886,Eadweard J. Muybridge,American,Male,1830.0,1904.0
6024,"""The Warsaw Ghetto""",Photography,1943,Associated Press,,,,


In [16]:
#Filter the dates that are in 1xxx-1xxx format:
#values containing a hyphen that are exactly nine characters long.
contains_hyphen = Gallery["Submission_Date"].str.contains('-')
is_full_range = Gallery["Submission_Date"].str.len() == 9
filter_Dates = Gallery["Submission_Date"][contains_hyphen & is_full_range]
filter_Dates

2        1889-1911
3        1927-1940
7        1978-1983
10       1949-1950
12       1908-1911
           ...    
16705    1889-1911
16706    1880-1910
16707    1945-1951
16709    1964-1965
16724    1968-1969
Name: Submission_Date, Length: 3066, dtype: object

In [17]:
#Function to take the mean value for filter_Dates
def mean_date(x):
    """Return the midpoint of a "start-end" year range as a float.

    ``x`` must consist of exactly two integer strings separated by '-';
    anything else raises ValueError.
    """
    years = [int(part) for part in x.split('-')]
    first_year, last_year = years
    return (first_year + last_year) / 2

In [18]:
# Apply the mean_date function to each element in the Series and write the
# midpoints back to the same rows. mean_date returns floats (e.g. 1933.5), so
# the column holds a mix of strings and floats until it is converted below.
Gallery.loc[filter_Dates.index, 'Submission_Date'] = filter_Dates.apply(mean_date)

In [19]:
# Display the frame to check that the ranges were replaced by their midpoints.
Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,American,Female,1947.0,2013.0
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,Spanish,Male,1916.0,2007.0
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1900.0,Maurice Denis,French,Male,1870.0,1943.0
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1933.5,Aristide Maillol,French,Male,1861.0,1944.0
4,97 rue du Bac,Photography,1903,Eugène Atget,French,Male,1857.0,1927.0
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968.5,Henry Moore,British,Male,1898.0,1986.0
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,Chilean,Male,1942.0,
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,American,Male,1882.0,1967.0
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,,,,


In [20]:
# Datatype conversion of numeric columns.
# Birth and Death contain missing values, so they use the nullable "Int64"
# dtype, which can hold integers alongside <NA>.
for year_column in ("Birth", "Death"):
    Gallery[year_column] = Gallery[year_column].astype("Int64")

# Submission_Date holds year strings and float midpoints; converting to int
# drops the fractional part of a midpoint (1933.5 becomes 1933).
Gallery["Submission_Date"] = Gallery["Submission_Date"].astype(int)

In [21]:
# Summarize dtypes and non-null counts to find the columns with missing values.
Gallery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16729 entries, 0 to 16728
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Art_Title        16728 non-null  object
 1   Department       16729 non-null  object
 2   Submission_Date  16729 non-null  int32 
 3   Artist           16729 non-null  object
 4   Nationality      16729 non-null  object
 5   Gender           16729 non-null  object
 6   Birth            15787 non-null  Int64 
 7   Death            10475 non-null  Int64 
dtypes: Int64(2), int32(1), object(5)
memory usage: 1013.0+ KB


In [22]:
#Finding the null value: index labels of the rows whose Art_Title is missing
Gallery.index[Gallery["Art_Title"].isnull()]

Index([13316], dtype='int64')

In [23]:
# Inspect the row whose Art_Title is missing.
Gallery.iloc[13316]

Art_Title                        NaN
Department         Fluxus Collection
Submission_Date                 1963
Artist                 Arthur Köpcke
Nationality                   German
Gender                          Male
Birth                           1928
Death                           1977
Name: 13316, dtype: object

In [24]:
#Filling the null value: label every missing Art_Title as 'Unknown' rather
#than addressing one hard-coded row, so the cell still works if the rows change.
Gallery['Art_Title'] = Gallery['Art_Title'].fillna('Unknown')

In [25]:
# Re-check the same row to confirm that Art_Title is now filled.
Gallery.iloc[13316]

Art_Title                    Unknown
Department         Fluxus Collection
Submission_Date                 1963
Artist                 Arthur Köpcke
Nationality                   German
Gender                          Male
Birth                           1928
Death                           1977
Name: 13316, dtype: object

In [26]:
# Fill the missing Birth and Death years with the median of each column.
# NOTE(review): rows with no recorded death year (e.g. index 16725) receive the
# median as well, so a filled Death value does not mean the artist died in that
# year - confirm this imputation is intended.
for year_column in ('Birth', 'Death'):
    column_median = Gallery[year_column].median()
    Gallery[year_column] = Gallery[year_column].fillna(column_median)
Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,American,Female,1947,2013
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,Spanish,Male,1916,2007
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1900,Maurice Denis,French,Male,1870,1943
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1933,Aristide Maillol,French,Male,1861,1944
4,97 rue du Bac,Photography,1903,Eugène Atget,French,Male,1857,1927
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968,Henry Moore,British,Male,1898,1986
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,Chilean,Male,1942,1975
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,American,Male,1882,1967
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,,,1915,1975


In [27]:
#Filling the void spaces
# Removing the parentheses left an empty string where no nationality or gender
# was recorded. A boolean mask relabels all such rows in one vectorized
# assignment per column, instead of iterating over the frame row by row.
for text_column in ("Nationality", "Gender"):
    Gallery.loc[Gallery[text_column] == "", text_column] = "Unknown"

Gallery

Unnamed: 0,Art_Title,Department,Submission_Date,Artist,Nationality,Gender,Birth,Death
0,Dress MacLeod from Tartan Sets,Prints & Illustrated Books,1986,Sarah Charlesworth,American,Female,1947,2013
1,Duplicate of plate from folio 11 verso (supple...,Prints & Illustrated Books,1978,Pablo Palazuelo,Spanish,Male,1916,2007
2,Tailpiece (page 55) from SAGESSE,Prints & Illustrated Books,1900,Maurice Denis,French,Male,1870,1943
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Prints & Illustrated Books,1933,Aristide Maillol,French,Male,1861,1944
4,97 rue du Bac,Photography,1903,Eugène Atget,French,Male,1857,1927
...,...,...,...,...,...,...,...,...
16724,Oval with Points,Painting & Sculpture,1968,Henry Moore,British,Male,1898,1986
16725,"Cementerio de la Ciudad Abierta, Ritoque, Chile",Architecture & Design,1975,Juan Baixas,Chilean,Male,1942,1975
16726,The Catboat,Prints & Illustrated Books,1922,Edward Hopper,American,Male,1882,1967
16727,Dognat' i peregnat' v tekhniko-ekonomicheskom ...,Prints & Illustrated Books,1931,Unknown,Unknown,Unknown,1915,1975


In [28]:
# Save the cleaned data; index=False keeps the row index out of the file.
Gallery.to_csv("AmericanGallery_cleaned.csv",index=False)